aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src')
-rw-r--r--native/jni/src/defines.h74
-rw-r--r--native/jni/src/dictionary/header/header_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp)73
-rw-r--r--native/jni/src/dictionary/header/header_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h)127
-rw-r--r--native/jni/src/dictionary/header/header_read_write_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp)41
-rw-r--r--native/jni/src/dictionary/header/header_read_write_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h)11
-rw-r--r--native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h)0
-rw-r--r--native/jni/src/dictionary/interface/dictionary_header_structure_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h)0
-rw-r--r--native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h)0
-rw-r--r--native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h)55
-rw-r--r--native/jni/src/dictionary/interface/ngram_listener.h (renamed from native/jni/src/suggest/core/dictionary/ngram_listener.h)4
-rw-r--r--native/jni/src/dictionary/property/historical_info.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h)5
-rw-r--r--native/jni/src/dictionary/property/ngram_context.cpp123
-rw-r--r--native/jni/src/dictionary/property/ngram_context.h78
-rw-r--r--native/jni/src/dictionary/property/ngram_property.h62
-rw-r--r--native/jni/src/dictionary/property/unigram_property.h137
-rw-r--r--native/jni/src/dictionary/property/word_attributes.h68
-rw-r--r--native/jni/src/dictionary/property/word_property.h (renamed from native/jni/src/suggest/core/dictionary/property/word_property.h)32
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/Readme.txt (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt)0
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp)35
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h)10
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp)12
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h)10
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h)6
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h)2
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp)21
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h)10
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h)6
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp)6
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h)8
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h)12
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h)14
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp)8
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h)6
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h)10
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp)8
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h)18
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h)2
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp)25
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h)6
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp)64
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h)17
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp)421
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h)88
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp)6
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h)2
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp)47
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h)10
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp)10
-rw-r--r--native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h)4
-rw-r--r--native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp)52
-rw-r--r--native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h)8
-rw-r--r--native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp)35
-rw-r--r--native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h)10
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp)10
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h)7
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp)17
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h)15
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h)0
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp)140
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h)47
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h)2
-rw-r--r--native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp)33
-rw-r--r--native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h)25
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h)0
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_params.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h)29
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h)2
-rw-r--r--native/jni/src/dictionary/structure/pt_common/pt_node_writer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h)6
-rw-r--r--native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp)20
-rw-r--r--native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h)13
-rw-r--r--native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h)19
-rw-r--r--native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp)264
-rw-r--r--native/jni/src/dictionary/structure/v2/patricia_trie_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h)91
-rw-r--r--native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h)18
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp)16
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h)20
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp)18
-rw-r--r--native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h)9
-rw-r--r--native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp34
-rw-r--r--native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h77
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp478
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h258
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp32
-rw-r--r--native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h101
-rw-r--r--native/jni/src/dictionary/structure/v4/content/probability_entry.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h)93
-rw-r--r--native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h)14
-rw-r--r--native/jni/src/dictionary/structure/v4/content/single_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h)13
-rw-r--r--native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h)21
-rw-r--r--native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp)7
-rw-r--r--native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h)10
-rw-r--r--native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h)8
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp)46
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h)28
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp)36
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_dict_constants.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h)20
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp)36
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h)15
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp)135
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h)33
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp603
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h)87
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp)4
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h)0
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp185
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h)59
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp)8
-rw-r--r--native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h)2
-rw-r--r--native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h (renamed from native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h)2
-rw-r--r--native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h (renamed from native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h)10
-rw-r--r--native/jni/src/dictionary/utils/bloom_filter.h (renamed from native/jni/src/suggest/core/dictionary/bloom_filter.h)0
-rw-r--r--native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp)8
-rw-r--r--native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h)2
-rw-r--r--native/jni/src/dictionary/utils/byte_array_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp)2
-rw-r--r--native/jni/src/dictionary/utils/byte_array_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h)27
-rw-r--r--native/jni/src/dictionary/utils/dict_file_writing_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp)24
-rw-r--r--native/jni/src/dictionary/utils/dict_file_writing_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h)4
-rw-r--r--native/jni/src/dictionary/utils/entry_counters.h89
-rw-r--r--native/jni/src/dictionary/utils/file_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp)2
-rw-r--r--native/jni/src/dictionary/utils/file_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h)0
-rw-r--r--native/jni/src/dictionary/utils/forgetting_curve_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp)91
-rw-r--r--native/jni/src/dictionary/utils/forgetting_curve_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h)27
-rw-r--r--native/jni/src/dictionary/utils/format_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp)28
-rw-r--r--native/jni/src/dictionary/utils/format_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h)14
-rw-r--r--native/jni/src/dictionary/utils/mmapped_buffer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp)4
-rw-r--r--native/jni/src/dictionary/utils/mmapped_buffer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h)0
-rw-r--r--native/jni/src/dictionary/utils/multi_bigram_map.cpp (renamed from native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp)55
-rw-r--r--native/jni/src/dictionary/utils/multi_bigram_map.h (renamed from native/jni/src/suggest/core/dictionary/multi_bigram_map.h)26
-rw-r--r--native/jni/src/dictionary/utils/probability_utils.cpp23
-rw-r--r--native/jni/src/dictionary/utils/probability_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h)15
-rw-r--r--native/jni/src/dictionary/utils/sparse_table.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp)2
-rw-r--r--native/jni/src/dictionary/utils/sparse_table.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h)3
-rw-r--r--native/jni/src/dictionary/utils/trie_map.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp)87
-rw-r--r--native/jni/src/dictionary/utils/trie_map.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h)19
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node.h60
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node_utils.cpp31
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node_utils.h7
-rw-r--r--native/jni/src/suggest/core/dicnode/dic_node_vector.h11
-rw-r--r--native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h84
-rw-r--r--native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h6
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.cpp123
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary.h53
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary_utils.cpp32
-rw-r--r--native/jni/src/suggest/core/dictionary/dictionary_utils.h3
-rw-r--r--native/jni/src/suggest/core/dictionary/digraph_utils.cpp2
-rw-r--r--native/jni/src/suggest/core/dictionary/error_type_utils.cpp21
-rw-r--r--native/jni/src/suggest/core/dictionary/error_type_utils.h15
-rw-r--r--native/jni/src/suggest/core/dictionary/property/bigram_property.h66
-rw-r--r--native/jni/src/suggest/core/dictionary/property/unigram_property.h114
-rw-r--r--native/jni/src/suggest/core/dictionary/property/word_property.cpp84
-rw-r--r--native/jni/src/suggest/core/layout/additional_proximity_chars.cpp2
-rw-r--r--native/jni/src/suggest/core/layout/additional_proximity_chars.h27
-rw-r--r--native/jni/src/suggest/core/layout/geometry_utils.h14
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info.cpp21
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info.h14
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state.cpp4
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_state.h3
-rw-r--r--native/jni/src/suggest/core/layout/proximity_info_utils.h11
-rw-r--r--native/jni/src/suggest/core/policy/scoring.h10
-rw-r--r--native/jni/src/suggest/core/policy/traversal.h5
-rw-r--r--native/jni/src/suggest/core/policy/weighting.cpp14
-rw-r--r--native/jni/src/suggest/core/policy/weighting.h2
-rw-r--r--native/jni/src/suggest/core/result/suggestion_results.cpp7
-rw-r--r--native/jni/src/suggest/core/result/suggestion_results.h12
-rw-r--r--native/jni/src/suggest/core/result/suggestions_output_utils.cpp125
-rw-r--r--native/jni/src/suggest/core/result/suggestions_output_utils.h13
-rw-r--r--native/jni/src/suggest/core/session/dic_traverse_session.cpp20
-rw-r--r--native/jni/src/suggest/core/session/dic_traverse_session.h23
-rw-r--r--native/jni/src/suggest/core/session/prev_words_info.h162
-rw-r--r--native/jni/src/suggest/core/suggest.cpp47
-rw-r--r--native/jni/src/suggest/core/suggest.h3
-rw-r--r--native/jni/src/suggest/core/suggest_interface.h3
-rw-r--r--native/jni/src/suggest/core/suggest_options.h9
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp282
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h72
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp219
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h128
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h99
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp95
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h83
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp551
-rw-r--r--native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp294
-rw-r--r--native/jni/src/suggest/policyimpl/typing/scoring_params.cpp13
-rw-r--r--native/jni/src/suggest/policyimpl/typing/scoring_params.h7
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_scoring.h55
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_traversal.h30
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp44
-rw-r--r--native/jni/src/suggest/policyimpl/typing/typing_weighting.h10
-rw-r--r--native/jni/src/utils/byte_array_view.h15
-rw-r--r--native/jni/src/utils/char_utils.cpp11
-rw-r--r--native/jni/src/utils/char_utils.h37
-rw-r--r--native/jni/src/utils/int_array_view.h82
-rw-r--r--native/jni/src/utils/jni_data_utils.cpp91
-rw-r--r--native/jni/src/utils/jni_data_utils.h30
-rw-r--r--native/jni/src/utils/ngram_utils.h63
-rw-r--r--native/jni/src/utils/profiler.h86
200 files changed, 5093 insertions, 4318 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h
index 24d04e51f..10b930e4f 100644
--- a/native/jni/src/defines.h
+++ b/native/jni/src/defines.h
@@ -23,10 +23,10 @@
#define AK_FORCE_INLINE inline
#endif // __GNUC__
-#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
+#if defined(FLAG_DBG)
#undef AK_FORCE_INLINE
#define AK_FORCE_INLINE inline
-#endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
+#endif // defined(FLAG_DBG)
// Must be equal to Constants.Dictionary.MAX_WORD_LENGTH in Java
#define MAX_WORD_LENGTH 48
@@ -119,7 +119,7 @@ static inline void dumpWordInfo(const int *word, const int length, const int ran
const int probability) {
static char charBuf[50];
const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf));
- if (N > 1) {
+ if (N > 0) {
AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability);
}
}
@@ -172,69 +172,6 @@ static inline void showStackTrace() {
#define INTS_TO_CHARS(input, length, output)
#endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG)
-#ifdef FLAG_DO_PROFILE
-// Profiler
-#include <time.h>
-
-#define PROF_BUF_SIZE 100
-static float profile_buf[PROF_BUF_SIZE];
-static float profile_old[PROF_BUF_SIZE];
-static unsigned int profile_counter[PROF_BUF_SIZE];
-
-#define PROF_RESET prof_reset()
-#define PROF_COUNT(prof_buf_id) ++profile_counter[prof_buf_id]
-#define PROF_OPEN do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while (0)
-#define PROF_START(prof_buf_id) do { \
- PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while (0)
-#define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while (0)
-#define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id])
-#define PROF_CLOCKOUT(prof_buf_id) \
- AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id]))
-#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while (0)
-
-static inline void prof_reset(void) {
- for (int i = 0; i < PROF_BUF_SIZE; ++i) {
- profile_buf[i] = 0;
- profile_old[i] = 0;
- profile_counter[i] = 0;
- }
-}
-
-static inline void prof_out(void) {
- if (profile_counter[PROF_BUF_SIZE - 1] != 1) {
- AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE.");
- }
- AKLOGI("Total time is %6.3f ms.",
- profile_buf[PROF_BUF_SIZE - 1] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC));
- float all = 0.0f;
- for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
- all += profile_buf[i];
- }
- if (all < 1.0f) all = 1.0f;
- for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
- if (profile_buf[i] > 0.0f) {
- AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.",
- i, (profile_buf[i] * 100.0f / all),
- profile_buf[i] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC),
- profile_counter[i]);
- }
- }
-}
-
-#else // FLAG_DO_PROFILE
-#define PROF_BUF_SIZE 0
-#define PROF_RESET
-#define PROF_COUNT(prof_buf_id)
-#define PROF_OPEN
-#define PROF_START(prof_buf_id)
-#define PROF_CLOSE
-#define PROF_END(prof_buf_id)
-#define PROF_CLOCK_OUT(prof_buf_id)
-#define PROF_CLOCKOUT(prof_buf_id)
-#define PROF_OUTALL
-
-#endif // FLAG_DO_PROFILE
-
#ifdef FLAG_DBG
#define DEBUG_DICT true
#define DEBUG_DICT_FULL false
@@ -299,8 +236,9 @@ static inline void prof_out(void) {
#define NOT_AN_INDEX (-1)
#define NOT_A_PROBABILITY (-1)
#define NOT_A_DICT_POS (S_INT_MIN)
+#define NOT_A_WORD_ID (S_INT_MIN)
#define NOT_A_TIMESTAMP (-1)
-#define NOT_A_LANGUAGE_WEIGHT (-1.0f)
+#define NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1.0f)
// A special value to mean the first word confidence makes no sense in this case,
// e.g. this is not a multi-word suggestion.
@@ -337,7 +275,7 @@ static inline void prof_out(void) {
#define MAX_POINTER_COUNT_G 2
// (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported.
-#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 1
+#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 3
#define DISALLOW_DEFAULT_CONSTRUCTOR(TypeName) \
TypeName() = delete
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/dictionary/header/header_policy.cpp
index 6ed65d921..d4f84d39f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp
+++ b/native/jni/src/dictionary/header/header_policy.cpp
@@ -14,10 +14,12 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
+#include "dictionary/header/header_policy.h"
#include <algorithm>
+#include "utils/ngram_utils.h"
+
namespace latinime {
// Note that these are corresponding definitions in Java side in DictionaryHeader.
@@ -28,33 +30,23 @@ const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
const char *const HeaderPolicy::DATE_KEY = "date";
const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
-const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
-const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
+const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] =
+ {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"};
+const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] =
+ {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT",
+ "MAX_QUADGRAM_ENTRY_COUNT"};
+const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000};
const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
// Historical info is information that is needed to support decaying such as timestamp, level and
// count.
const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
-const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY =
- "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
"FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
-const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
- "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
-
-const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
-const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
-const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 2;
const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
-// 30 days
-const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS =
- 30 * 24 * 60 * 60;
-
-const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
-const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000;
// Used for logging. Question mark is used to indicate that the key is not found.
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
@@ -100,12 +92,11 @@ bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
}
bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
- const int unigramCount, const int bigramCount,
- const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const {
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ BufferWithExtendableBuffer *const outBuffer) const {
int writingPos = 0;
DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
- fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount,
- extendedRegionSize, &attributeMapToWrite);
+ fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
&writingPos)) {
return false;
@@ -132,11 +123,22 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim
return true;
}
-void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount,
- const int bigramCount, const int extendedRegionSize,
+namespace {
+
+int getIndexFromNgramType(const NgramType ngramType) {
+ return static_cast<int>(ngramType);
+}
+
+} // namespace
+
+void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
+ const EntryCounts &entryCounts, const int extendedRegionSize,
DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
- HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount);
- HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount);
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ HeaderReadWriteUtils::setIntAttribute(outAttributeMap,
+ NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)],
+ entryCounts.getNgramCount(ngramType));
+ }
HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
extendedRegionSize);
// Set the current time as the generation time.
@@ -157,4 +159,25 @@ void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int uni
return attributeMap;
}
+/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const {
+ MutableEntryCounters entryCounters;
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */);
+ entryCounters.setNgramCount(ngramType, entryCount);
+ }
+ return entryCounters.getEntryCounts();
+}
+
+/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const {
+ MutableEntryCounters entryCounters;
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ const int index = getIndexFromNgramType(ngramType);
+ const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
+ MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]);
+ entryCounters.setNgramCount(ngramType, maxEntryCount);
+ }
+ return entryCounters.getEntryCounts();
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/dictionary/header/header_policy.h
index 87cf0cd3b..47cc9196a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h
+++ b/native/jni/src/dictionary/header/header_policy.h
@@ -20,9 +20,10 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/format_utils.h"
#include "utils/char_utils.h"
#include "utils/time_keeper.h"
@@ -45,27 +46,15 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
- mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
- UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
- mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
- BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
+ mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
&mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
- mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
- DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
- mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
- DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
- mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
- mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
@@ -82,22 +71,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
- mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0),
+ mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
+ mExtendedRegionSize(0),
mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
&mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
- mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
- DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
&mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
- mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
- DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
- mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
- mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
- &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
+ mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
// Copy header information
HeaderPolicy(const HeaderPolicy *const headerPolicy)
@@ -108,27 +89,22 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
mIsDecayingDict(headerPolicy->mIsDecayingDict),
mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
- mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount),
+ mNgramCounts(headerPolicy->mNgramCounts),
+ mMaxNgramCounts(headerPolicy->mMaxNgramCounts),
mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
- mForgettingCurveOccurrencesToLevelUp(
- headerPolicy->mForgettingCurveOccurrencesToLevelUp),
mForgettingCurveProbabilityValuesTableId(
headerPolicy->mForgettingCurveProbabilityValuesTableId),
- mForgettingCurveDurationToLevelDown(
- headerPolicy->mForgettingCurveDurationToLevelDown),
- mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
- mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
+ mCodePointTable(headerPolicy->mCodePointTable) {}
// Temporary dummy header.
HeaderPolicy()
: mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
- mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
+ mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(),
mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
- mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
- mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
+ mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {}
~HeaderPolicy() {}
@@ -138,13 +114,17 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
// same so we use them for both here.
switch (mDictFormatVersion) {
case FormatUtils::VERSION_2:
- return FormatUtils::VERSION_2;
+ case FormatUtils::VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return FormatUtils::UNKNOWN_VERSION;
+ case FormatUtils::VERSION_202:
+ return FormatUtils::VERSION_202;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
- case FormatUtils::VERSION_4:
- return FormatUtils::VERSION_4;
- case FormatUtils::VERSION_4_DEV:
- return FormatUtils::VERSION_4_DEV;
+ case FormatUtils::VERSION_402:
+ return FormatUtils::VERSION_402;
+ case FormatUtils::VERSION_403:
+ return FormatUtils::VERSION_403;
default:
return FormatUtils::UNKNOWN_VERSION;
}
@@ -186,12 +166,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return mLastDecayedTime;
}
- AK_FORCE_INLINE int getUnigramCount() const {
- return mUnigramCount;
+ AK_FORCE_INLINE const EntryCounts &getNgramCounts() const {
+ return mNgramCounts;
}
- AK_FORCE_INLINE int getBigramCount() const {
- return mBigramCount;
+ AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const {
+ return mMaxNgramCounts;
}
AK_FORCE_INLINE int getExtendedRegionSize() const {
@@ -211,35 +191,19 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
return &mAttributeMap;
}
- AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const {
- return mForgettingCurveOccurrencesToLevelUp;
- }
-
AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
return mForgettingCurveProbabilityValuesTableId;
}
- AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
- return mForgettingCurveDurationToLevelDown;
- }
-
- AK_FORCE_INLINE int getMaxUnigramCount() const {
- return mMaxUnigramCount;
- }
-
- AK_FORCE_INLINE int getMaxBigramCount() const {
- return mMaxBigramCount;
- }
-
void readHeaderValueOrQuestionMark(const char *const key,
int *outValue, int outValueSize) const;
bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
- const int unigramCount, const int bigramCount,
- const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const;
+ const EntryCounts &entryCounts, const int extendedRegionSize,
+ BufferWithExtendableBuffer *const outBuffer) const;
- void fillInHeader(const bool updatesLastDecayedTime,
- const int unigramCount, const int bigramCount, const int extendedRegionSize,
+ void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
+ const int extendedRegionSize,
DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
AK_FORCE_INLINE const std::vector<int> *getLocale() const {
@@ -247,7 +211,11 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
}
bool supportsBeginningOfSentence() const {
- return mDictFormatVersion >= FormatUtils::VERSION_4;
+ return mDictFormatVersion >= FormatUtils::VERSION_402;
+ }
+
+ const int *getCodePointTable() const {
+ return mCodePointTable;
}
private:
@@ -258,23 +226,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
static const char *const IS_DECAYING_DICT_KEY;
static const char *const DATE_KEY;
static const char *const LAST_DECAYED_TIME_KEY;
- static const char *const UNIGRAM_COUNT_KEY;
- static const char *const BIGRAM_COUNT_KEY;
+ static const char *const NGRAM_COUNT_KEYS[];
+ static const char *const MAX_NGRAM_COUNT_KEYS[];
+ static const int DEFAULT_MAX_NGRAM_COUNTS[];
static const char *const EXTENDED_REGION_SIZE_KEY;
static const char *const HAS_HISTORICAL_INFO_KEY;
static const char *const LOCALE_KEY;
static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
- static const char *const MAX_UNIGRAM_COUNT_KEY;
- static const char *const MAX_BIGRAM_COUNT_KEY;
static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
- static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
- static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
- static const int DEFAULT_MAX_UNIGRAM_COUNT;
- static const int DEFAULT_MAX_BIGRAM_COUNT;
const FormatUtils::FORMAT_VERSION mDictFormatVersion;
const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
@@ -286,20 +249,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy {
const bool mIsDecayingDict;
const int mDate;
const int mLastDecayedTime;
- const int mUnigramCount;
- const int mBigramCount;
+ const EntryCounts mNgramCounts;
+ const EntryCounts mMaxNgramCounts;
const int mExtendedRegionSize;
const bool mHasHistoricalInfoOfWords;
- const int mForgettingCurveOccurrencesToLevelUp;
const int mForgettingCurveProbabilityValuesTableId;
- const int mForgettingCurveDurationToLevelDown;
- const int mMaxUnigramCount;
- const int mMaxBigramCount;
+ const int *const mCodePointTable;
const std::vector<int> readLocale() const;
float readMultipleWordCostMultiplier() const;
bool readRequiresGermanUmlautProcessing() const;
-
+ const EntryCounts readNgramCounts() const;
+ const EntryCounts readMaxNgramCounts() const;
static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
const uint8_t *const dictBuf);
};
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/dictionary/header/header_read_write_utils.cpp
index a8f8f284b..779f8b8c3 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp
+++ b/native/jni/src/dictionary/header/header_read_write_utils.cpp
@@ -14,15 +14,16 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
+#include "dictionary/header/header_read_write_utils.h"
#include <cctype>
#include <cstdio>
+#include <memory>
#include <vector>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
@@ -34,12 +35,13 @@ namespace latinime {
const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11;
const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256;
-const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256;
+const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048;
const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4;
const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2;
const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4;
+const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable";
const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0;
@@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
return;
}
int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH];
- int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH];
+ std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]);
while (pos < headerSize) {
+ // The values in the header don't use the code point table for their encoding.
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
- MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos);
+ MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos);
std::vector<int> key;
key.insert(key.end(), keyBuffer, keyBuffer + keyLength);
const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf,
- MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos);
+ MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos);
std::vector<int> value;
- value.insert(value.end(), valueBuffer, valueBuffer + valueLength);
+ value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength);
headerAttributes->insert(AttributeMap::value_type(key, value));
}
}
+/* static */ const int *HeaderReadWriteUtils::readCodePointTable(
+ AttributeMap *const headerAttributes) {
+ AttributeMap::key_type keyVector;
+ insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector);
+ AttributeMap::const_iterator it = headerAttributes->find(keyVector);
+ if (it == headerAttributes->end()) {
+ return nullptr;
+ }
+ return it->second.data();
+}
+
/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion(
BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version,
int *const writingPos) {
@@ -96,11 +110,13 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
}
switch (version) {
case FormatUtils::VERSION_2:
- // Version 2 dictionary writing is not supported.
+ case FormatUtils::VERSION_201:
+ case FormatUtils::VERSION_202:
+ // None of the static dictionaries (v2x) support writing
return false;
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4:
- case FormatUtils::VERSION_4_DEV:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
return buffer->writeUintAndAdvancePosition(version /* data */,
HEADER_DICTIONARY_VERSION_SIZE, writingPos);
default:
@@ -142,7 +158,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap;
}
/* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute(
- AttributeMap *const headerAttributes, const char *const key, const std::vector<int> value) {
+ AttributeMap *const headerAttributes, const char *const key,
+ const std::vector<int> &value) {
AttributeMap::key_type keyVector;
insertCharactersIntoVector(key, &keyVector);
(*headerAttributes)[keyVector] = value;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/dictionary/header/header_read_write_utils.h
index 9b90488fc..f67d614df 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h
+++ b/native/jni/src/dictionary/header/header_read_write_utils.h
@@ -20,8 +20,8 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/utils/format_utils.h"
namespace latinime {
@@ -46,6 +46,9 @@ class HeaderReadWriteUtils {
static void fetchAllHeaderAttributes(const uint8_t *const dictBuf,
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+ static const int *readCodePointTable(
+ DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes);
+
static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer,
const FormatUtils::FORMAT_VERSION version, int *const writingPos);
@@ -64,7 +67,7 @@ class HeaderReadWriteUtils {
*/
static void setCodePointVectorAttribute(
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
- const char *const key, const std::vector<int> value);
+ const char *const key, const std::vector<int> &value);
static void setBoolAttribute(
DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes,
@@ -101,6 +104,8 @@ class HeaderReadWriteUtils {
static const int HEADER_FLAG_SIZE;
static const int HEADER_SIZE_FIELD_SIZE;
+ static const char *const CODE_POINT_TABLE_KEY;
+
// Value for the "flags" field. It's unused at the moment.
static const DictionaryFlags NO_FLAGS;
diff --git a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h
index aa0d068aa..aa0d068aa 100644
--- a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h
+++ b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h
diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h
index 6da390e55..6da390e55 100644
--- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h
+++ b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h
diff --git a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
index 40b6c2de1..40b6c2de1 100644
--- a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h
+++ b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h
diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h
index e91f07682..ace48491d 100644
--- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h
+++ b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h
@@ -20,16 +20,20 @@
#include <memory>
#include "defines.h"
-#include "suggest/core/dictionary/property/word_property.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/word_attributes.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "utils/int_array_view.h"
namespace latinime {
class DicNode;
class DicNodeVector;
class DictionaryHeaderStructurePolicy;
-class DictionaryShortcutsStructurePolicy;
+class MultiBigramMap;
class NgramListener;
-class PrevWordsInfo;
+class NgramContext;
class UnigramProperty;
/*
@@ -47,42 +51,45 @@ class DictionaryStructureWithBufferPolicy {
virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const = 0;
- virtual int getCodePointsAndProbabilityAndReturnCodePointCount(
- const int nodePos, const int maxCodePointCount, int *const outCodePoints,
- int *const outUnigramProbability) const = 0;
+ virtual int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const = 0;
- virtual int getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const = 0;
+ virtual int getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const = 0;
- virtual int getProbability(const int unigramProbability,
- const int bigramProbability) const = 0;
+ virtual const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const = 0;
- virtual int getProbabilityOfPtNode(const int *const prevWordsPtNodePos,
- const int nodePos) const = 0;
+ // TODO: Remove
+ virtual int getProbability(const int unigramProbability, const int bigramProbability) const = 0;
- virtual void iterateNgramEntries(const int *const prevWordsPtNodePos,
+ virtual int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const = 0;
+
+ virtual void iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const = 0;
- virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0;
+ virtual BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const = 0;
virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0;
- virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0;
-
// Returns whether the update was success or not.
- virtual bool addUnigramEntry(const int *const word, const int length,
+ virtual bool addUnigramEntry(const CodePointArrayView wordCodePoints,
const UnigramProperty *const unigramProperty) = 0;
// Returns whether the update was success or not.
- virtual bool removeUnigramEntry(const int *const word, const int length) = 0;
+ virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0;
+
+ // Returns whether the update was success or not.
+ virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0;
// Returns whether the update was success or not.
- virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty) = 0;
+ virtual bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) = 0;
// Returns whether the update was success or not.
- virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const int *const word, const int length) = 0;
+ virtual bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo) = 0;
// Returns whether the flush was success or not.
virtual bool flush(const char *const filePath) = 0;
@@ -97,9 +104,7 @@ class DictionaryStructureWithBufferPolicy {
virtual void getProperty(const char *const query, const int queryLength, char *const outResult,
const int maxResultLength) = 0;
- // Used for testing.
- virtual const WordProperty getWordProperty(const int *const codePonts,
- const int codePointCount) const = 0;
+ virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0;
// Method to iterate all words in the dictionary.
// The returned token has to be used to get the next word. If token is 0, this method newly
diff --git a/native/jni/src/suggest/core/dictionary/ngram_listener.h b/native/jni/src/dictionary/interface/ngram_listener.h
index 88b88bafb..2eb5e9fd1 100644
--- a/native/jni/src/suggest/core/dictionary/ngram_listener.h
+++ b/native/jni/src/dictionary/interface/ngram_listener.h
@@ -26,7 +26,9 @@ namespace latinime {
*/
class NgramListener {
public:
- virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos) = 0;
+ // ngramProbability is always 0 for v403 decaying dictionary.
+ // TODO: Remove ngramProbability.
+ virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0;
virtual ~NgramListener() {};
protected:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h b/native/jni/src/dictionary/property/historical_info.h
index 428ca8626..e5ce1ea25 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h
+++ b/native/jni/src/dictionary/property/historical_info.h
@@ -34,10 +34,11 @@ class HistoricalInfo {
return mTimestamp != NOT_A_TIMESTAMP;
}
- int getTimeStamp() const {
+ int getTimestamp() const {
return mTimestamp;
}
+ // TODO: Remove
int getLevel() const {
return mLevel;
}
@@ -47,7 +48,7 @@ class HistoricalInfo {
}
private:
- // Copy constructor is public to use this class as a type of return value.
+ // Default copy constructor is used for using in std::vector.
DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo);
const int mTimestamp;
diff --git a/native/jni/src/dictionary/property/ngram_context.cpp b/native/jni/src/dictionary/property/ngram_context.cpp
new file mode 100644
index 000000000..7b9c3eff6
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/property/ngram_context.h"
+
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// Creates a context that holds no previous word information.
+NgramContext::NgramContext() : mPrevWordCount(0) {
+    // isValid() reads slot 0 of the count and flag arrays unconditionally, so they must be
+    // initialized even though no previous word is stored.
+    clear();
+}
+
+// Copies the previous word slots that are in use in the given context.
+NgramContext::NgramContext(const NgramContext &ngramContext)
+        : mPrevWordCount(ngramContext.mPrevWordCount) {
+    // Reset every slot first. Only the first mPrevWordCount slots are copied below, but
+    // isValid() reads slot 0 even when mPrevWordCount is 0.
+    clear();
+    for (size_t i = 0; i < mPrevWordCount; ++i) {
+        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
+        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
+                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
+        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
+    }
+}
+
+// Builds a context from several previous words. The number of stored words is
+// prevWordCount capped with NELEMS(mPrevWordCodePoints).
+NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+        const size_t prevWordCount)
+        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
+    // Every slot starts out zero-length and not a beginning of sentence, so a slot that is
+    // skipped below keeps those values.
+    clear();
+    for (size_t slot = 0; slot < mPrevWordCount; ++slot) {
+        const int length = prevWordCodePointCount[slot];
+        const bool hasUsableLength = length >= 0 && length <= MAX_WORD_LENGTH;
+        if (hasUsableLength) {
+            memmove(mPrevWordCodePoints[slot], prevWordCodePoints[slot],
+                    sizeof(mPrevWordCodePoints[slot][0]) * length);
+            mPrevWordCodePointCount[slot] = length;
+            mIsBeginningOfSentence[slot] = isBeginningOfSentence[slot];
+        }
+    }
+}
+
+// Builds a context from a single previous word. When the arguments are unusable the context
+// keeps one empty slot that is not a beginning of sentence.
+NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+        const bool isBeginningOfSentence) : mPrevWordCount(1) {
+    clear();
+    // A negative count is rejected like in the multi-word constructor; it would otherwise be
+    // converted to a huge size for memmove below.
+    if (prevWordCodePointCount < 0 || prevWordCodePointCount > MAX_WORD_LENGTH
+            || !prevWordCodePoints) {
+        return;
+    }
+    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
+            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
+    mPrevWordCodePointCount[0] = prevWordCodePointCount;
+    mIsBeginningOfSentence[0] = isBeginningOfSentence;
+}
+
+// Only the first previous word (slot 0) is inspected: the context is valid when that word
+// has code points or is flagged as a beginning of sentence.
+bool NgramContext::isValid() const {
+    return mPrevWordCodePointCount[0] > 0 || mIsBeginningOfSentence[0];
+}
+
+// n is 1-indexed. Any n outside [1, mPrevWordCount] yields a default-constructed view.
+const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
+    const bool isInRange = n > 0 && n <= mPrevWordCount;
+    if (!isInRange) {
+        return CodePointArrayView();
+    }
+    const size_t slot = n - 1;
+    return CodePointArrayView(mPrevWordCodePoints[slot], mPrevWordCodePointCount[slot]);
+}
+
+// n is 1-indexed. Any n outside [1, mPrevWordCount] is reported as not a beginning of
+// sentence.
+bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
+    const bool isInRange = n > 0 && n <= mPrevWordCount;
+    return isInRange ? mIsBeginningOfSentence[n - 1] : false;
+}
+
+// Looks up the id of the given word through dictStructurePolicy. When isBeginningOfSentence
+// is set, the beginning-of-sentence marker is attached to a local copy of the code points
+// before the lookup. When tryLowerCaseSearch is set and the first lookup fails, the lookup is
+// repeated with forceLowerCaseSearch enabled.
+// Returns NOT_A_WORD_ID when the arguments are unusable or the marker cannot be attached.
+/* static */ int NgramContext::getWordId(
+        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+        const int *const wordCodePoints, const int wordCodePointCount,
+        const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
+    // A negative count is rejected too; it would otherwise be converted to a huge size for
+    // memmove below.
+    if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount < 0
+            || wordCodePointCount > MAX_WORD_LENGTH) {
+        return NOT_A_WORD_ID;
+    }
+    int codePoints[MAX_WORD_LENGTH];
+    int codePointCount = wordCodePointCount;
+    memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
+    if (isBeginningOfSentence) {
+        codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
+                MAX_WORD_LENGTH);
+        if (codePointCount <= 0) {
+            return NOT_A_WORD_ID;
+        }
+    }
+    const CodePointArrayView codePointArrayView(codePoints, codePointCount);
+    const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
+            false /* forceLowerCaseSearch */);
+    if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
+        // Return the id when the word was found or when lower case search is not requested.
+        return wordId;
+    }
+    // Check bigrams for lower-cased previous word if original was not found. Useful for
+    // auto-capitalized words like "The [current_word]".
+    return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
+}
+
+// Marks every previous word slot as zero-length and not a beginning of sentence. The stored
+// code points themselves are left untouched.
+void NgramContext::clear() {
+    for (int &codePointCount : mPrevWordCodePointCount) {
+        codePointCount = 0;
+    }
+    for (bool &beginningOfSentenceFlag : mIsBeginningOfSentence) {
+        beginningOfSentenceFlag = false;
+    }
+}
+} // namespace latinime
diff --git a/native/jni/src/dictionary/property/ngram_context.h b/native/jni/src/dictionary/property/ngram_context.h
new file mode 100644
index 000000000..9b36199c9
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_CONTEXT_H
+#define LATINIME_NGRAM_CONTEXT_H
+
+#include <array>
+
+#include "defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DictionaryStructureWithBufferPolicy;
+
+class NgramContext { // Code points and beginning-of-sentence flags of the previous words.
+ public:
+ // No prev word information: the previous word count is set to 0.
+ NgramContext();
+ // Copy constructor to use this class with std::vector and use this class as a return value.
+ NgramContext(const NgramContext &ngramContext);
+ // Construct from previous words. prevWordCount is capped with NELEMS(mPrevWordCodePoints); words whose count is negative or above MAX_WORD_LENGTH are left empty.
+ NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+ const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+ const size_t prevWordCount);
+ // Construct from a previous word. Nothing is stored when prevWordCodePoints is null or the count exceeds MAX_WORD_LENGTH.
+ NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+ const bool isBeginningOfSentence);
+
+ size_t getPrevWordCount() const { // Number of previous word slots in use.
+ return mPrevWordCount;
+ }
+ bool isValid() const; // True when the first previous word has code points or is a beginning of sentence.
+
+ template<size_t N>
+ const WordIdArrayView getPrevWordIds( // Writes word ids into prevWordIdBuffer and returns a view on it.
+ const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+ WordIdArray<N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
+ for (size_t i = 0; i < std::min(mPrevWordCount, N); ++i) { // Never writes past the N entries of the buffer.
+ prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i],
+ mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch);
+ }
+ return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount); // NOTE(review): presumably limit() also clamps to N when mPrevWordCount > N - confirm.
+ }
+
+ // n is 1-indexed. A default-constructed view is returned when n is 0 or above the previous word count.
+ const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const;
+ // n is 1-indexed. false is returned when n is 0 or above the previous word count.
+ bool isNthPrevWordBeginningOfSentence(const size_t n) const;
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(NgramContext);
+
+ static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, // Looks up one word; returns NOT_A_WORD_ID on failure.
+ const int *const wordCodePoints, const int wordCodePointCount,
+ const bool isBeginningOfSentence, const bool tryLowerCaseSearch);
+ void clear(); // Resets all code point counts to 0 and all beginning-of-sentence flags to false.
+
+ const size_t mPrevWordCount; // Number of slots in use in the arrays below.
+ int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; // Code points of each previous word; slot 0 is the 1st previous word.
+ int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; // Number of code points stored in each slot.
+ bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; // Whether each slot represents a beginning of sentence.
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_CONTEXT_H
diff --git a/native/jni/src/dictionary/property/ngram_property.h b/native/jni/src/dictionary/property/ngram_property.h
new file mode 100644
index 000000000..5f259ec59
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_property.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_PROPERTY_H
+#define LATINIME_NGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/ngram_context.h"
+
+namespace latinime {
+
+class NgramProperty { // Immutable value object: an n-gram context, its target word, a probability and historical info.
+ public:
+ NgramProperty(const NgramContext &ngramContext, const std::vector<int> &&targetCodePoints, // ngramContext is copied into the member.
+ const int probability, const HistoricalInfo historicalInfo)
+ : mNgramContext(ngramContext), mTargetCodePoints(std::move(targetCodePoints)), // NOTE(review): targetCodePoints is a const rvalue reference, so this std::move presumably selects the copy constructor - confirm.
+ mProbability(probability), mHistoricalInfo(historicalInfo) {}
+
+ const NgramContext *getNgramContext() const { // Pointer to the member; valid while this object is alive.
+ return &mNgramContext;
+ }
+
+ const std::vector<int> *getTargetCodePoints() const { // Pointer to the member; valid while this object is alive.
+ return &mTargetCodePoints;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ const HistoricalInfo getHistoricalInfo() const { // Returned by value.
+ return mHistoricalInfo;
+ }
+
+ private:
+ // Default copy constructor is used for using in std::vector.
+ DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
+ DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty);
+
+ const NgramContext mNgramContext;
+ const std::vector<int> mTargetCodePoints;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/unigram_property.h b/native/jni/src/dictionary/property/unigram_property.h
new file mode 100644
index 000000000..92f61b85d
--- /dev/null
+++ b/native/jni/src/dictionary/property/unigram_property.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_UNIGRAM_PROPERTY_H
+#define LATINIME_UNIGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+
+namespace latinime {
+
+class UnigramProperty { // Immutable value object: flags, probability, historical info and shortcuts of a word.
+ public:
+ class ShortcutProperty { // A shortcut target (code points) with its probability.
+ public:
+ ShortcutProperty(const std::vector<int> &&targetCodePoints, const int probability)
+ : mTargetCodePoints(std::move(targetCodePoints)), // NOTE(review): const rvalue reference, so this std::move presumably selects the copy constructor - confirm.
+ mProbability(probability) {}
+
+ const std::vector<int> *getTargetCodePoints() const { // Pointer to the member; valid while this object is alive.
+ return &mTargetCodePoints;
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ private:
+ // Default copy constructor is used for using in std::vector.
+ DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty);
+
+ const std::vector<int> mTargetCodePoints;
+ const int mProbability;
+ };
+
+ UnigramProperty() // All flags false, probability NOT_A_PROBABILITY, default historical info, no shortcuts.
+ : mRepresentsBeginningOfSentence(false), mIsNotAWord(false),
+ mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY),
+ mHistoricalInfo(), mShortcuts() {}
+
+ // In contexts which do not support the Blacklisted flag (v2, v4<403); mIsBlacklisted is set to false.
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
+
+ // Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403); mIsBlacklisted is set to false.
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+ // In contexts which DO support the Blacklisted flag (v403); isBlacklisted is the third argument.
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts(std::move(shortcuts)) {}
+
+ // Without shortcuts, in contexts which DO support the Blacklisted flag (v403); isBlacklisted is the third argument.
+ UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+ const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+ const HistoricalInfo historicalInfo)
+ : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+ mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+ mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+ mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+ bool representsBeginningOfSentence() const {
+ return mRepresentsBeginningOfSentence;
+ }
+
+ bool isNotAWord() const {
+ return mIsNotAWord;
+ }
+
+ bool isPossiblyOffensive() const {
+ return mIsPossiblyOffensive;
+ }
+
+ bool isBlacklisted() const { // Always false for instances built without the isBlacklisted argument.
+ return mIsBlacklisted;
+ }
+
+ bool hasShortcuts() const { // True when at least one shortcut is stored.
+ return !mShortcuts.empty();
+ }
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ const HistoricalInfo getHistoricalInfo() const { // Returned by value.
+ return mHistoricalInfo;
+ }
+
+ const std::vector<ShortcutProperty> &getShortcuts() const { // Reference to the member; valid while this object is alive.
+ return mShortcuts;
+ }
+
+ private:
+ // Default copy constructor is used for using as a return value.
+ DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
+
+ const bool mRepresentsBeginningOfSentence;
+ const bool mIsNotAWord;
+ const bool mIsBlacklisted;
+ const bool mIsPossiblyOffensive;
+ const int mProbability;
+ const HistoricalInfo mHistoricalInfo;
+ const std::vector<ShortcutProperty> mShortcuts;
+};
+} // namespace latinime
+#endif // LATINIME_UNIGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/word_attributes.h b/native/jni/src/dictionary/property/word_attributes.h
new file mode 100644
index 000000000..5351e7d7d
--- /dev/null
+++ b/native/jni/src/dictionary/property/word_attributes.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_ATTRIBUTES_H
+#define LATINIME_WORD_ATTRIBUTES_H
+
+#include "defines.h"
+
+class WordAttributes { // Value object: probability of a word plus its blacklisted / not-a-word / possibly-offensive flags.
+ public:
+ // Invalid word attributes: probability is NOT_A_PROBABILITY and every flag is false.
+ WordAttributes()
+ : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false),
+ mIsPossiblyOffensive(false) {}
+
+ WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord, // Stores the given values as they are.
+ const bool isPossiblyOffensive)
+ : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord),
+ mIsPossiblyOffensive(isPossiblyOffensive) {}
+
+ int getProbability() const {
+ return mProbability;
+ }
+
+ bool isBlacklisted() const {
+ return mIsBlacklisted;
+ }
+
+ bool isNotAWord() const {
+ return mIsNotAWord;
+ }
+
+ // Whether or not a word is possibly offensive.
+ // * Static dictionaries <v202, as well as dynamic dictionaries <v403, will set this based on
+ // whether or not the probability of the word is zero.
+ // * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag.
+ // * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model
+ // flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero)
+ // NOTE(review): v202 itself is covered by neither "<v202" nor ">=v203" above - confirm the intended bounds.
+ // See the ::getWordAttributes function for each of these dictionary policies for more details.
+ bool isPossiblyOffensive() const {
+ return mIsPossiblyOffensive;
+ }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes);
+
+ int mProbability;
+ bool mIsBlacklisted;
+ bool mIsNotAWord;
+ bool mIsPossiblyOffensive;
+};
+
+ // NOTE: no namespace is opened in this header; WordAttributes is declared in the global namespace.
+#endif /* LATINIME_WORD_ATTRIBUTES_H */
diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/dictionary/property/word_property.h
index aa3e0b68a..3028e020a 100644
--- a/native/jni/src/suggest/core/dictionary/property/word_property.h
+++ b/native/jni/src/dictionary/property/word_property.h
@@ -20,9 +20,9 @@
#include <vector>
#include "defines.h"
-#include "jni.h"
-#include "suggest/core/dictionary/property/bigram_property.h"
-#include "suggest/core/dictionary/property/unigram_property.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -31,23 +31,23 @@ class WordProperty {
public:
// Default constructor is used to create an instance that indicates an invalid word.
WordProperty()
- : mCodePoints(), mUnigramProperty(), mBigrams() {}
+ : mCodePoints(), mUnigramProperty(), mNgrams() {}
- WordProperty(const std::vector<int> *const codePoints,
- const UnigramProperty *const unigramProperty,
- const std::vector<BigramProperty> *const bigrams)
- : mCodePoints(*codePoints), mUnigramProperty(*unigramProperty), mBigrams(*bigrams) {}
+ WordProperty(const std::vector<int> &&codePoints, const UnigramProperty &unigramProperty,
+ const std::vector<NgramProperty> &ngrams)
+ : mCodePoints(std::move(codePoints)), mUnigramProperty(unigramProperty),
+ mNgrams(ngrams) {}
- void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
- jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
- jobject outShortcutTargets, jobject outShortcutProbabilities) const;
+ const CodePointArrayView getCodePoints() const {
+ return CodePointArrayView(mCodePoints);
+ }
- const UnigramProperty *getUnigramProperty() const {
- return &mUnigramProperty;
+ const UnigramProperty &getUnigramProperty() const {
+ return mUnigramProperty;
}
- const std::vector<BigramProperty> *getBigramProperties() const {
- return &mBigrams;
+ const std::vector<NgramProperty> &getNgramProperties() const {
+ return mNgrams;
}
private:
@@ -56,7 +56,7 @@ class WordProperty {
const std::vector<int> mCodePoints;
const UnigramProperty mUnigramProperty;
- const std::vector<BigramProperty> mBigrams;
+ const std::vector<NgramProperty> mNgrams;
};
} // namespace latinime
#endif // LATINIME_WORD_PROPERTY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt b/native/jni/src/dictionary/structure/backward/v402/Readme.txt
index 9e29e836c..9e29e836c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt
+++ b/native/jni/src/dictionary/structure/backward/v402/Readme.txt
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
index 3e8e059f2..60749bce6 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp
@@ -19,18 +19,18 @@
* Do not edit this file other than updating policy's interface.
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp
+ * dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
-#include "suggest/core/dictionary/property/bigram_property.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
namespace backward {
@@ -60,7 +60,7 @@ void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out
}
bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) {
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
// 1. The word has no bigrams yet.
// 2. The word has bigrams, and there is the target in the list.
// 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
@@ -79,7 +79,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
- bigramProperty);
+ ngramProperty);
// Write an entry.
const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) {
@@ -112,7 +112,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
- &newBigramEntry, bigramProperty);
+ &newBigramEntry, ngramProperty);
if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) {
return false;
}
@@ -138,7 +138,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget
const BigramEntry updatedBigramEntry =
originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
- &updatedBigramEntry, bigramProperty);
+ &updatedBigramEntry, ngramProperty);
return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
}
@@ -264,18 +264,17 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
const BigramEntry *const originalBigramEntry,
- const BigramProperty *const bigramProperty) const {
+ const NgramProperty *const ngramProperty) const {
// TODO: Consolidate historical info and probability.
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
- const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(),
- bigramProperty->getLevel(), bigramProperty->getCount());
+ const HistoricalInfo &historicalInfoForUpdate = ngramProperty->getHistoricalInfo();
const HistoricalInfo updatedHistoricalInfo =
ForgettingCurveUtils::createUpdatedHistoricalInfo(
- originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(),
+ originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(),
&historicalInfoForUpdate, mHeaderPolicy);
return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
} else {
- return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability());
+ return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability());
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
index 50a4c9743..58c88ce8a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
+++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h
@@ -26,8 +26,8 @@
#define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H
#include "defines.h"
-#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/structure/backward/v402/content/bigram_entry.h"
namespace latinime {
namespace backward {
@@ -36,7 +36,7 @@ namespace v402 {
class BigramDictContent;
} // namespace v402
} // namespace backward
-class BigramProperty;
+class NgramProperty;
namespace backward {
namespace v402 {
} // namespace v402
@@ -64,7 +64,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
}
bool addNewEntry(const int terminalId, const int newTargetTerminalId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
bool removeEntry(const int terminalId, const int targetTerminalId);
@@ -80,7 +80,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
int *const outTailEntryPos) const;
const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
- const BigramProperty *const bigramProperty) const;
+ const NgramProperty *const ngramProperty) const;
bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
index e2dd93c5e..7fa85dec2 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp
@@ -18,12 +18,12 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
+ * dictionary/structure/v4/content/bigram_dict_content.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
namespace backward {
@@ -65,6 +65,8 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
(encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ?
Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId;
if (mHasHistoricalInfo) {
+ // Hack for better migration.
+ count += level;
const HistoricalInfo historicalInfo(timestamp, level, count);
return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId);
} else {
@@ -83,10 +85,10 @@ bool BigramDictContent::writeBigramEntryAndAdvancePosition(
}
if (mHasHistoricalInfo) {
const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo();
- if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(),
+ if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) {
AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos,
- historicalInfo->getTimeStamp());
+ historicalInfo->getTimestamp());
return false;
}
if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(),
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
index b554e5676..14f334a12 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h
@@ -18,17 +18,17 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h
+ * dictionary/structure/v4/content/bigram_dict_content.h
*/
#ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H
#define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/content/bigram_entry.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h
index 40968b4d8..36ad855ee 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h
@@ -18,15 +18,15 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h
+ * dictionary/structure/v4/content/bigram_entry.h
*/
#ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
#define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/historical_info.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h
index 0f2f25534..d3b84fa04 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h
@@ -18,7 +18,7 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/dict_content.h
+ * dictionary/structure/v4/content/dict_content.h
*/
#ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp
index c671647d4..b167f0ab2 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp
@@ -18,15 +18,15 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp
+ * dictionary/structure/v4/content/probability_dict_content.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
namespace backward {
@@ -50,7 +50,8 @@ const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int ter
Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos);
const int count = buffer->readUintAndAdvancePosition(
Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos);
- const HistoricalInfo historicalInfo(timestamp, level, count);
+ // Hack for better migration: the level read above is added to the count passed below.
+ const HistoricalInfo historicalInfo(timestamp, level, count + level);
return ProbabilityEntry(flags, probability, &historicalInfo);
} else {
return ProbabilityEntry(flags, probability);
@@ -74,8 +75,8 @@ bool ProbabilityDictContent::setProbabilityEntry(const int terminalId,
return false;
}
writingPos += getEntrySize();
- mSize++;
}
+ mSize = terminalId + 1;
}
return writeEntry(probabilityEntry, entryPos);
}
@@ -100,7 +101,6 @@ bool ProbabilityDictContent::flushToFile(const char *const dictPath) const {
bool ProbabilityDictContent::runGC(
const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
const ProbabilityDictContent *const originalProbabilityDictContent) {
- mSize = 0;
for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
it != terminalIdMap->end(); ++it) {
const ProbabilityEntry probabilityEntry =
@@ -109,7 +109,6 @@ bool ProbabilityDictContent::runGC(
AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second);
return false;
}
- mSize++;
}
return true;
}
@@ -147,7 +146,7 @@ bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilit
}
if (mHasHistoricalInfo) {
const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo();
- if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(),
+ if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(),
Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) {
AKLOGE("Cannot write timestamp in probability dict content. pos: %d", writingPos);
return false;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h
index 3734797d4..464b29f3f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h
@@ -18,17 +18,17 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h
+ * dictionary/structure/v4/content/probability_dict_content.h
*/
#ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H
#define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/structure/backward/v402/content/single_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h
index 8ccfa33dc..94e36bf51 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h
@@ -18,15 +18,15 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h
+ * dictionary/structure/v4/content/probability_entry.h
*/
#ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H
#define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/historical_info.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
index 56bc8b98d..e538a02a1 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp
@@ -18,12 +18,12 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
+ * dictionary/structure/v4/content/shortcut_dict_content.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h
index 179cec5bb..3b725e896 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h
@@ -18,16 +18,16 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h
+ * dictionary/structure/v4/content/shortcut_dict_content.h
*/
#ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H
#define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h
index 49f446814..89df2a1e0 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h
@@ -18,18 +18,18 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h
+ * dictionary/structure/v4/content/single_dict_content.h
*/
#ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H
#define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/structure/backward/v402/content/dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
#include "utils/byte_array_view.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
index 7c9b4967a..280f0f85a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp
@@ -18,10 +18,10 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp
+ * dictionary/structure/v4/content/sparse_table_dict_content.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
+#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h
index 3c626df11..4b5af87ad 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h
@@ -18,19 +18,19 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h
+ * dictionary/structure/v4/content/sparse_table_dict_content.h
*/
#ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H
#define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/sparse_table.h"
+#include "dictionary/structure/backward/v402/content/dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
+#include "dictionary/utils/sparse_table.h"
#include "utils/byte_array_view.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
index a9f841779..30b72bbd1 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp
@@ -18,13 +18,13 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
+ * dictionary/structure/v4/content/terminal_position_lookup_table.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
index eadfe0faa..641c7496f 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
+++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h
@@ -18,7 +18,7 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h
+ * dictionary/structure/v4/content/terminal_position_lookup_table.h
*/
#ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H
@@ -27,8 +27,8 @@
#include <unordered_map>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/content/single_dict_content.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
index 941fda748..8cda8c5cf 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
+++ b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h
@@ -19,17 +19,17 @@
* Do not edit this file other than updating policy's interface.
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
+ * dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
#define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H
#include "defines.h"
-#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
index 3dfbd1c94..4a9704f4d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp
@@ -18,18 +18,18 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp
+ * dictionary/structure/v4/ver4_dict_buffers.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
#include <cerrno>
#include <cstring>
#include <sys/stat.h>
#include <sys/types.h>
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
#include "utils/byte_array_view.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h
index e775be52e..0d09fee9a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h
@@ -18,7 +18,7 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h
+ * dictionary/structure/v4/ver4_dict_buffers.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H
@@ -27,14 +27,14 @@
#include <memory>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/content/bigram_dict_content.h"
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp
index 81d85f495..2948d0716 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp
@@ -18,10 +18,10 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp
+ * dictionary/structure/v4/ver4_dict_constants.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h
index 88ebd6a75..15581d852 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h
@@ -18,7 +18,7 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h
+ * dictionary/structure/v4/ver4_dict_constants.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
index 82399f190..871ef7aaf 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp
@@ -18,18 +18,19 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
+ * dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/backward/v402/content/probability_dict_content.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
namespace backward {
@@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
const int parentPos =
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH];
- const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
- dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
+ const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
+ dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
int probability = NOT_A_PROBABILITY;
@@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
// The destination position is stored at the same place as the parent position.
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
} else {
- return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints,
+ return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
newSiblingNodePos);
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h
index 1999a51a6..367d6f9f8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h
@@ -18,15 +18,15 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
+ * dictionary/structure/v4/ver4_patricia_trie_node_reader.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H
#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
index 278f2b199..e3ab5ec20 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp
@@ -18,23 +18,23 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
+ * dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
-#include "suggest/core/dictionary/property/unigram_property.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
namespace backward {
@@ -232,10 +232,10 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
}
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) {
- if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty, outAddedNewEntry)) {
- AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
- sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId());
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) {
+ if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) {
+ AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d",
+ prevWordIds[0], wordId);
return false;
}
const int ptNodePos =
@@ -245,7 +245,7 @@ bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds
if (!sourcePtNodeParams.hasBigrams()) {
// Update has bigrams flag.
return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(),
- sourcePtNodeParams.isBlacklisted(), sourcePtNodeParams.isNotAWord(),
+ sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(),
sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(),
true /* hasBigrams */,
sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */);
@@ -310,13 +310,13 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN
const int shortcutProbability) {
if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
targetCodePoints, targetCodePointCount, shortcutProbability)) {
- AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId());
+ AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
return false;
}
if (!ptNodeParams->hasShortcutTargets()) {
// Update has shortcut targets flag.
return updatePtNodeFlags(ptNodeParams->getHeadPos(),
- ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
+ ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(),
ptNodeParams->isTerminal(), true /* hasShortcutTargets */,
ptNodeParams->hasBigrams(),
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
@@ -330,7 +330,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags(
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
ptNodeParams->getTerminalId()) != NOT_A_DICT_POS;
- return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(),
+ return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(),
ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets,
hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
}
@@ -386,8 +386,9 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
return false;
}
- return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
- isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(),
+ return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(),
+ ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(),
+ ptNodeParams->hasBigrams(),
ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
}
@@ -396,8 +397,7 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
const UnigramProperty *const unigramProperty) const {
// TODO: Consolidate historical info and probability.
if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
- const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(),
- unigramProperty->getLevel(), unigramProperty->getCount());
+ const HistoricalInfo &historicalInfoForUpdate = unigramProperty->getHistoricalInfo();
const HistoricalInfo updatedHistoricalInfo =
ForgettingCurveUtils::createUpdatedHistoricalInfo(
originalProbabilityEntry->getHistoricalInfo(),
@@ -425,6 +425,18 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
return true;
}
+// Overwrites the probability entry stored for the given PtNode's terminal id with an entry built
+// from a default ProbabilityEntry and a zeroed HistoricalInfo (timestamp, level and count all 0).
+// Returns false without writing when the header policy reports no historical info of words;
+// otherwise returns the result of setProbabilityEntry().
+bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) {
+    if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
+        const HistoricalInfo zeroedHistoricalInfo(
+                0 /* timestamp */, 0 /* level */, 0 /* count */);
+        const ProbabilityEntry suppressedEntry =
+                ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&zeroedHistoricalInfo);
+        const int terminalId = ptNodeParams->getTerminalId();
+        return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry(
+                terminalId, &suppressedEntry);
+    }
+    // Historical info is required to suppress a unigram entry.
+    return false;
+}
+
} // namespace v402
} // namespace backward
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
index d49d9a666..db3cea174 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h
@@ -18,17 +18,17 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
+ * dictionary/structure/v4/ver4_patricia_trie_node_writer.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H
#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/structure/backward/v402/content/probability_entry.h"
#include "utils/int_array_view.h"
namespace latinime {
@@ -94,7 +94,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
@@ -111,6 +111,11 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams);
+ // Suppresses the unigram so that the word is no longer used for generating suggestions. This
+ // works only for dictionaries with historical info. Suppressed entries are still included in
+ // the unigram count; they are removed from the dictionary during GC.
+ bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams);
+
private:
DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
index 1296b8acd..6fb9cffb7 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp
@@ -19,24 +19,25 @@
* Do not edit this file other than updating policy's interface.
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+ * dictionary/structure/v4/ver4_patricia_trie_policy.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
#include <vector>
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
-#include "suggest/core/dictionary/ngram_listener.h"
-#include "suggest/core/dictionary/property/bigram_property.h"
-#include "suggest/core/dictionary/property/unigram_property.h"
-#include "suggest/core/dictionary/property/word_property.h"
-#include "suggest/core/session/prev_words_info.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
-#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
namespace latinime {
namespace backward {
@@ -51,6 +52,7 @@ const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_C
const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1;
void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const {
@@ -76,12 +78,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
// Skip PtNodes that represent non-word information.
continue;
}
- childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
- ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
- ptNodeParams.hasChildren(),
- ptNodeParams.isBlacklisted()
- || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
- ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
+ const int wordId = isTerminal ? ptNodeParams.getHeadPos() : NOT_A_WORD_ID;
+ childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(),
+ wordId, ptNodeParams.getCodePointArrayView());
}
if (readingHelper.isError()) {
mIsCorrupted = true;
@@ -89,13 +88,13 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d
}
}
-int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
- const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
- int *const outUnigramProbability) const {
+int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints) const {
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
readingHelper.initWithPtNodePos(ptNodePos);
- const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
- maxCodePointCount, outCodePoints, outUnigramProbability);
+ const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount(
+ maxCodePointCount, outCodePoints);
if (readingHelper.isError()) {
mIsCorrupted = true;
AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
@@ -103,72 +102,143 @@ int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
return codePointCount;
}
-int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const {
+int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const {
DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition());
- const int ptNodePos =
- readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch);
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+ wordCodePoints.size(), forceLowerCaseSearch);
if (readingHelper.isError()) {
mIsCorrupted = true;
- AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ AKLOGE("Dictionary reading error in getWordId().");
+ }
+ return getWordIdFromTerminalPtNodePos(ptNodePos);
+}
+
+const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
+ const WordIdArrayView prevWordIds, const int wordId,
+ MultiBigramMap *const multiBigramMap) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return WordAttributes();
}
- return ptNodePos;
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
+ if (multiBigramMap) {
+ const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
+ prevWordIds, wordId, ptNodeParams.getProbability());
+ return getWordAttributes(probability, ptNodeParams);
+ }
+ if (!prevWordIds.empty()) {
+ const int probability = getProbabilityOfWord(prevWordIds, wordId);
+ if (probability != NOT_A_PROBABILITY) {
+ return getWordAttributes(probability, ptNodeParams);
+ }
+ }
+ return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
+ ptNodeParams);
+}
+
+const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const {
+ return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
+ ptNodeParams.getProbability() == 0);
}
int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
const int bigramProbability) const {
- if (mHeaderPolicy->isDecayingDict()) {
- // Both probabilities are encoded. Decode them and get probability.
- return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
- } else {
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return ProbabilityUtils::backoff(unigramProbability);
- } else {
- return bigramProbability;
- }
+ // In the v4 format, bigramProbability is a conditional probability.
+ const int bigramConditionalProbability = bigramProbability;
+ if (unigramProbability == NOT_A_PROBABILITY) {
+ return NOT_A_PROBABILITY;
}
+ if (bigramConditionalProbability == NOT_A_PROBABILITY) {
+ return ProbabilityUtils::backoff(unigramProbability);
+ }
+ return bigramConditionalProbability;
}
-int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos,
- const int ptNodePos) const {
- if (ptNodePos == NOT_A_DICT_POS) {
+// Returns the probability of the word identified by wordId, optionally in the context of the
+// previous word prevWordIds[0]; only the first previous word id is consulted.
+// Returns NOT_A_PROBABILITY when wordId is NOT_A_WORD_ID, when the word's PtNode is deleted or
+// is not a word, when prevWordIds[0] is NOT_A_WORD_ID, or when the previous word has no bigram
+// entry with a valid probability pointing at this word. When prevWordIds is empty or the
+// previous word's PtNode is deleted, the result is getProbability() of the unigram probability
+// with no bigram probability.
+int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+ const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
return NOT_A_PROBABILITY;
}
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
- if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
+ if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) {
return NOT_A_PROBABILITY;
}
- if (prevWordsPtNodePos) {
- const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
- BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
- while (bigramsIt.hasNext()) {
- bigramsIt.next();
- if (bigramsIt.getBigramPos() == ptNodePos
- && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
- return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
- }
- }
+ if (prevWordIds.empty()) {
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ }
+ if (prevWordIds[0] == NOT_A_WORD_ID) {
return NOT_A_PROBABILITY;
}
- return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ // Convert the previous word id to its terminal PtNode position, as done for wordId above.
+ const PtNodeParams prevWordPtNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(
+ getTerminalPtNodePosFromWordId(prevWordIds[0]));
+ if (prevWordPtNodeParams.isDeleted()) {
+ return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
+ }
+ const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ prevWordPtNodeParams.getTerminalId());
+ BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
+ while (bigramsIt.hasNext()) {
+ bigramsIt.next();
+ if (bigramsIt.getBigramPos() == ptNodePos
+ && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
+ const int bigramConditionalProbability = getBigramConditionalProbability(
+ prevWordPtNodeParams.getProbability(),
+ prevWordPtNodeParams.representsBeginningOfSentence(),
+ bigramsIt.getProbability());
+ return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability);
+ }
+ }
+ return NOT_A_PROBABILITY;
}
-void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos,
+// Reports every bigram entry stored for the previous word prevWordIds[0] to the listener,
+// passing the entry's conditional probability and the word id of the bigram target.
+// Does nothing when prevWordIds is empty, when its first id is NOT_A_WORD_ID, or when the
+// previous word's PtNode is deleted. Only the first previous word id is consulted.
+void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const {
- if (!prevWordsPtNodePos) {
+ // prevWordIds holds word ids, so test against the word id sentinel.
+ if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
+ return;
+ }
+ // Convert the previous word id to its terminal PtNode position before reading the PtNode.
+ const PtNodeParams prevWordPtNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(
+ getTerminalPtNodePosFromWordId(prevWordIds[0]));
+ if (prevWordPtNodeParams.isDeleted()) {
return;
}
- const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
+ const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos(
+ prevWordPtNodeParams.getTerminalId());
BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
while (bigramsIt.hasNext()) {
bigramsIt.next();
- listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos());
+ const int bigramConditionalProbability = getBigramConditionalProbability(
+ prevWordPtNodeParams.getProbability(),
+ prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability());
+ listener->onVisitEntry(bigramConditionalProbability,
+ getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
}
}
+// Converts a stored bigram probability into the value used as a conditional probability.
+// When the dictionary has historical info of words and the context is not the beginning of a
+// sentence, the result is MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability,
+// capped at MAX_PROBABILITY. In every other case bigramProbability is returned unchanged.
+int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability,
+        const bool isInBeginningOfSentenceContext, const int bigramProbability) const {
+    const bool needsConversion =
+            mHeaderPolicy->hasHistoricalInfoOfWords() && !isInBeginningOfSentenceContext;
+    if (!needsConversion) {
+        // No conversion is applied; bigramProbability is returned as is.
+        return bigramProbability;
+    }
+    // Calculate conditional probability.
+    const int conditionalProbability =
+            MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability;
+    return std::min(conditionalProbability, MAX_PROBABILITY);
+}
+
+// Builds a shortcut iterator for the word with the given id: the id is mapped to its terminal
+// PtNode position, and that position to the shortcut position handed to the iterator.
+BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
+        const int wordId) const {
+    const int terminalPtNodePos = getTerminalPtNodePosFromWordId(wordId);
+    return BinaryDictionaryShortcutIterator(&mShortcutPolicy,
+            getShortcutPositionOfPtNode(terminalPtNodePos));
+}
+
int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
if (ptNodePos == NOT_A_DICT_POS) {
return NOT_A_DICT_POS;
@@ -193,7 +263,7 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons
ptNodeParams.getTerminalId());
}
-bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
+bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
const UnigramProperty *const unigramProperty) {
if (!mBuffers->isUpdatable()) {
AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
@@ -204,13 +274,14 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le
mDictBuffer->getTailPosition());
return false;
}
- if (length > MAX_WORD_LENGTH) {
- AKLOGE("The word is too long to insert to the dictionary, length: %d", length);
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert to the dictionary, length: %zd",
+ wordCodePoints.size());
return false;
}
for (const auto &shortcut : unigramProperty->getShortcuts()) {
if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
- AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d",
+ AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
shortcut.getTargetCodePoints()->size());
return false;
}
@@ -219,8 +290,8 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le
readingHelper.initWithPtNodeArrayPos(getRootPosition());
bool addedNewUnigram = false;
int codePointsToAdd[MAX_WORD_LENGTH];
- int codePointCountToAdd = length;
- memmove(codePointsToAdd, word, sizeof(int) * length);
+ int codePointCountToAdd = wordCodePoints.size();
+ memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
if (unigramProperty->representsBeginningOfSentence()) {
codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
codePointCountToAdd, MAX_WORD_LENGTH);
@@ -228,24 +299,25 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le
if (codePointCountToAdd <= 0) {
return false;
}
- if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd,
- unigramProperty, &addedNewUnigram)) {
+ const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
+ if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
+ &addedNewUnigram)) {
if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
- mUnigramCount++;
+ mEntryCounters.incrementNgramCount(NgramType::Unigram);
}
if (unigramProperty->getShortcuts().size() > 0) {
// Add shortcut target.
- const int wordPos = getTerminalPtNodePositionOfWord(word, length,
- false /* forceLowerCaseSearch */);
+ const int wordPos = getTerminalPtNodePosFromWordId(
+ getWordId(codePointArrayView, false /* forceLowerCaseSearch */));
if (wordPos == NOT_A_DICT_POS) {
AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
return false;
}
for (const auto &shortcut : unigramProperty->getShortcuts()) {
if (!mUpdatingHelper.addShortcutTarget(wordPos,
- shortcut.getTargetCodePoints()->data(),
- shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) {
- AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, "
+ CodePointArrayView(*shortcut.getTargetCodePoints()),
+ shortcut.getProbability())) {
+ AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
"probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
shortcut.getProbability());
return false;
@@ -258,8 +330,21 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le
}
}
-bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty) {
+// Removes the unigram for wordCodePoints by suppressing its entry through the node writer.
+// Returns false when the dictionary is not updatable or when the word has no terminal PtNode
+// position; otherwise returns the result of suppressUnigramEntry().
+bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
+    if (!mBuffers->isUpdatable()) {
+        AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
+        return false;
+    }
+    // Look the word up without forcing a lower-case search, then map its id to a position.
+    const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+    const int terminalPtNodePos = getTerminalPtNodePosFromWordId(wordId);
+    if (terminalPtNodePos != NOT_A_DICT_POS) {
+        const PtNodeParams terminalPtNodeParams =
+                mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
+        return mNodeWriter.suppressUnigramEntry(&terminalPtNodeParams);
+    }
+    return false;
+}
+
+bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
if (!mBuffers->isUpdatable()) {
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
return false;
@@ -269,50 +354,50 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
mDictBuffer->getTailPosition());
return false;
}
- if (!prevWordsInfo->isValid()) {
- AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
+ const NgramContext *const ngramContext = ngramProperty->getNgramContext();
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
return false;
}
- if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
AKLOGE("The word is too long to insert the ngram to the dictionary. "
- "length: %d", bigramProperty->getTargetCodePoints()->size());
+ "length: %zd", ngramProperty->getTargetCodePoints()->size());
return false;
}
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
false /* tryLowerCaseSearch */);
- // TODO: Support N-gram.
- if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
- if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
- const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ if (prevWordIds.empty()) {
+ return false;
+ }
+ if (prevWordIds[0] == NOT_A_WORD_ID) {
+ if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
const UnigramProperty beginningOfSentenceUnigramProperty(
true /* representsBeginningOfSentence */, true /* isNotAWord */,
- false /* isBlacklisted */, MAX_PROBABILITY /* probability */,
- NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
- if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */),
- prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */),
+ false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo());
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
&beginningOfSentenceUnigramProperty)) {
AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
return false;
}
- // Refresh Terminal PtNode positions.
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
- false /* tryLowerCaseSearch */);
+ // Refresh word ids.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
} else {
return false;
}
}
- const int word1Pos = getTerminalPtNodePositionOfWord(
- bigramProperty->getTargetCodePoints()->data(),
- bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */);
- if (word1Pos == NOT_A_DICT_POS) {
+ const int wordPos = getTerminalPtNodePosFromWordId(getWordId(
+ CodePointArrayView(*ngramProperty->getTargetCodePoints()),
+ false /* forceLowerCaseSearch */));
+ if (wordPos == NOT_A_DICT_POS) {
return false;
}
bool addedNewBigram = false;
- if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::fromObject(prevWordsPtNodePos),
- word1Pos, bigramProperty, &addedNewBigram)) {
+ const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
+ if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos),
+ wordPos, ngramProperty, &addedNewBigram)) {
if (addedNewBigram) {
- mBigramCount++;
+ mEntryCounters.incrementNgramCount(NgramType::Bigram);
}
return true;
} else {
@@ -320,8 +405,8 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI
}
}
-bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const int *const word, const int length) {
+bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) {
if (!mBuffers->isUpdatable()) {
AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
return false;
@@ -331,40 +416,68 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor
mDictBuffer->getTailPosition());
return false;
}
- if (!prevWordsInfo->isValid()) {
- AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary.");
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
return false;
}
- if (length > MAX_WORD_LENGTH) {
- AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length);
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
+ wordCodePoints.size());
}
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
false /* tryLowerCaseSerch */);
- // TODO: Support N-gram.
- if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
+ if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
return false;
}
- const int wordPos = getTerminalPtNodePositionOfWord(word, length,
- false /* forceLowerCaseSearch */);
+ const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints,
+ false /* forceLowerCaseSearch */));
if (wordPos == NOT_A_DICT_POS) {
return false;
}
+ const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]);
if (mUpdatingHelper.removeNgramEntry(
- PtNodePosArrayView::fromObject(prevWordsPtNodePos), wordPos)) {
- mBigramCount--;
+ PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) {
+ mEntryCounters.decrementNgramCount(NgramType::Bigram);
return true;
} else {
return false;
}
}
+
+// Adds or updates the unigram entry for wordCodePoints and then the n-gram entry that links it
+// to ngramContext, both carrying historicalInfo.
+// The unigram probability is DUMMY_PROBABILITY_FOR_VALID_WORDS when isValidWord is true and
+// NOT_A_PROBABILITY otherwise; the n-gram uses the same value unless the first previous word is
+// the beginning of a sentence, in which case it uses NOT_A_PROBABILITY.
+// Returns false when the dictionary is not updatable or when either entry cannot be added;
+// the unigram entry is not rolled back if only the n-gram step fails.
+bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
+        const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
+        const bool isValidWord, const HistoricalInfo historicalInfo) {
+    if (!mBuffers->isUpdatable()) {
+        AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+                "dictionary.");
+        return false;
+    }
+    const int probability = isValidWord ? DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY;
+    const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+            false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo);
+    if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
+        AKLOGE("Cannot update unigram entry in updateEntriesForWordWithNgramContext().");
+        return false;
+    }
+    const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
+            ? NOT_A_PROBABILITY : probability;
+    const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram,
+            historicalInfo);
+    if (!addNgramEntry(&ngramProperty)) {
+        // Name the step that actually failed so the two error paths can be told apart in logs.
+        AKLOGE("Cannot update ngram entry in updateEntriesForWordWithNgramContext().");
+        return false;
+    }
+    return true;
+}
+
bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
if (!mBuffers->isUpdatable()) {
AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
return false;
}
- if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) {
+ if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
AKLOGE("Cannot flush the dictionary to file.");
mIsCorrupted = true;
return false;
@@ -402,7 +515,7 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
// Needs to reduce dictionary size.
return true;
} else if (mHeaderPolicy->isDecayingDict()) {
- return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount,
+ return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
mHeaderPolicy);
}
return false;
@@ -412,41 +525,42 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer
char *const outResult, const int maxResultLength) {
const int compareLength = queryLength + 1 /* terminator */;
if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
- snprintf(outResult, maxResultLength, "%d", mUnigramCount);
+ snprintf(outResult, maxResultLength, "%d",
+ mEntryCounters.getNgramCount(NgramType::Unigram));
} else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
- snprintf(outResult, maxResultLength, "%d", mBigramCount);
+ snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
} else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
snprintf(outResult, maxResultLength, "%d",
mHeaderPolicy->isDecayingDict() ?
- ForgettingCurveUtils::getUnigramCountHardLimit(
- mHeaderPolicy->getMaxUnigramCount()) :
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Unigram)) :
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
} else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
snprintf(outResult, maxResultLength, "%d",
mHeaderPolicy->isDecayingDict() ?
- ForgettingCurveUtils::getBigramCountHardLimit(
- mHeaderPolicy->getMaxBigramCount()) :
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Bigram)) :
static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
}
}
-const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints,
- const int codePointCount) const {
- const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
- false /* forceLowerCaseSearch */);
+const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
+ const CodePointArrayView wordCodePoints) const {
+ const int ptNodePos = getTerminalPtNodePosFromWordId(
+ getWordId(wordCodePoints, false /* forceLowerCaseSearch */));
if (ptNodePos == NOT_A_DICT_POS) {
AKLOGE("getWordProperty is called for invalid word.");
return WordProperty();
}
const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- std::vector<int> codePointVector(ptNodeParams.getCodePoints(),
- ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
const ProbabilityEntry probabilityEntry =
mBuffers->getProbabilityDictContent()->getProbabilityEntry(
ptNodeParams.getTerminalId());
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
// Fetch bigram information.
- std::vector<BigramProperty> bigrams;
+ std::vector<NgramProperty> ngrams;
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
if (bigramListPos != NOT_A_DICT_POS) {
int bigramWord1CodePoints[MAX_WORD_LENGTH];
@@ -465,21 +579,21 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
continue;
}
- // Word (unigram) probability
- int word1Probability = NOT_A_PROBABILITY;
- const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
- word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints,
- &word1Probability);
- const std::vector<int> word1(bigramWord1CodePoints,
- bigramWord1CodePoints + codePointCount);
+ const int codePointCount = getCodePointsAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH,
+ bigramWord1CodePoints);
const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
- const int probability = bigramEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(
- bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
- bigramEntry.getProbability();
- bigrams.emplace_back(&word1, probability,
- historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
- historicalInfo->getCount());
+ const int rawBigramProbability = bigramEntry.hasHistoricalInfo()
+ ? ForgettingCurveUtils::decodeProbability(
+ bigramEntry.getHistoricalInfo(), mHeaderPolicy)
+ : bigramEntry.getProbability();
+ const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(),
+ ptNodeParams.representsBeginningOfSentence(), rawBigramProbability);
+ ngrams.emplace_back(
+ NgramContext(wordCodePoints.data(), wordCodePoints.size(),
+ ptNodeParams.representsBeginningOfSentence()),
+ CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(),
+ probability, *historicalInfo);
}
}
// Fetch shortcut information.
@@ -495,15 +609,15 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code
int shortcutProbability = NOT_A_PROBABILITY;
shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
&shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
- const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength);
- shortcuts.emplace_back(&target, shortcutProbability);
+ shortcuts.emplace_back(
+ CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
+ shortcutProbability);
}
}
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
- ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
- historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
- historicalInfo->getCount(), &shortcuts);
- return WordProperty(&codePointVector, &unigramProperty, &bigrams);
+ ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
+ ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts));
+ return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
}
int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
@@ -524,9 +638,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
return 0;
}
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
- int unigramProbability = NOT_A_PROBABILITY;
- *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
- terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
+ *outCodePointCount = getCodePointsAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated.
@@ -536,6 +649,14 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const
return nextToken;
}
+// Converts a terminal PtNode position into a word id. A valid position is returned unchanged
+// (the position itself serves as the word id); only the invalid sentinel is translated, from
+// NOT_A_DICT_POS to NOT_A_WORD_ID.
+int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
+ return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
+}
+
+// Converts a word id back into a terminal PtNode position. A valid word id is returned unchanged
+// (it is used directly as the position); only the invalid sentinel is translated, from
+// NOT_A_WORD_ID to NOT_A_DICT_POS.
+int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
+ return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
+}
+
} // namespace v402
} // namespace backward
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
index 9e989b268..bce5f6bea 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h
@@ -19,7 +19,7 @@
* Do not edit this file other than updating policy's interface.
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
+ * dictionary/structure/v4/ver4_patricia_trie_policy.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H
@@ -28,17 +28,21 @@
#include <vector>
#include "defines.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "utils/int_array_view.h"
namespace latinime {
namespace backward {
@@ -55,6 +59,8 @@ class DicNodeVector;
namespace backward {
namespace v402 {
+// Word id = Position of a PtNode that represents the word.
+// Max supported n-gram is bigram.
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
public:
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
@@ -70,54 +76,50 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
&mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy),
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
mWritingHelper(mBuffers.get()),
- mUnigramCount(mHeaderPolicy->getUnigramCount()),
- mBigramCount(mHeaderPolicy->getBigramCount()),
+ mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
+ // Returns the root position used to start traversal; it is fixed at 0 in this policy.
- AK_FORCE_INLINE int getRootPosition() const {
+ virtual int getRootPosition() const {
return 0;
}
void createAndGetAllChildDicNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const;
- int getCodePointsAndProbabilityAndReturnCodePointCount(
- const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints,
- int *const outUnigramProbability) const;
+ int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const;
- int getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const;
+ int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
+
+ const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const;
int getProbability(const int unigramProbability, const int bigramProbability) const;
- int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, const int ptNodePos) const;
+ int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
- void iterateNgramEntries(const int *const prevWordsPtNodePos,
+ void iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const;
- int getShortcutPositionOfPtNode(const int ptNodePos) const;
+ BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
+ // Exposes the header policy held in mHeaderPolicy through the generic
+ // DictionaryHeaderStructurePolicy interface.
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
return mHeaderPolicy;
}
- const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
- return &mShortcutPolicy;
- }
-
- bool addUnigramEntry(const int *const word, const int length,
+ bool addUnigramEntry(const CodePointArrayView wordCodePoints,
const UnigramProperty *const unigramProperty);
- bool removeUnigramEntry(const int *const word, const int length) {
- // Removing unigram entry is not supported.
- return false;
- }
+ bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
+
+ bool addNgramEntry(const NgramProperty *const ngramProperty);
- bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty);
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints);
- bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1,
- const int length1);
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo);
bool flush(const char *const filePath);
@@ -128,8 +130,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
void getProperty(const char *const query, const int queryLength, char *const outResult,
const int maxResultLength);
- const WordProperty getWordProperty(const int *const codePoints,
- const int codePointCount) const;
+ const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
@@ -149,6 +150,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
// prevent the dictionary from overflowing.
static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS;
+ static const int DUMMY_PROBABILITY_FOR_VALID_WORDS;
const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
const HeaderPolicy *const mHeaderPolicy;
@@ -160,12 +162,18 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
Ver4PatriciaTrieNodeWriter mNodeWriter;
DynamicPtUpdatingHelper mUpdatingHelper;
Ver4PatriciaTrieWritingHelper mWritingHelper;
- int mUnigramCount;
- int mBigramCount;
+ MutableEntryCounters mEntryCounters;
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
mutable bool mIsCorrupted;
int getBigramsPositionOfPtNode(const int ptNodePos) const;
+ int getShortcutPositionOfPtNode(const int ptNodePos) const;
+ int getWordIdFromTerminalPtNodePos(const int ptNodePos) const;
+ int getTerminalPtNodePosFromWordId(const int wordId) const;
+ const WordAttributes getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const;
+ int getBigramConditionalProbability(const int prevWordUnigramProbability,
+ const bool isInBeginningOfSentenceContext, const int bigramProbability) const;
};
} // namespace v402
} // namespace backward
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
index 80d531198..b8a4cf847 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp
@@ -18,12 +18,12 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
+ * dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
index 3579c26d6..c3e736bdc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h
@@ -18,7 +18,7 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
+ * dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
index 3fb4caa08..c0af9eae6 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp
@@ -18,43 +18,43 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
+ * dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h"
#include <cstring>
#include <queue>
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h"
+#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
namespace backward {
namespace v402 {
bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
- const int unigramCount, const int bigramCount) const {
+ const EntryCounts &entryCounts) const {
const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
BufferWithExtendableBuffer headerBuffer(
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
+ mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
- unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) {
+ entryCounts, extendedRegionSize, &headerBuffer)) {
AKLOGE("Cannot write header structure to buffer. "
"updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
- "extendedRegionSize: %d", false, unigramCount, bigramCount,
- extendedRegionSize);
+ "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram),
+ entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize);
return false;
}
return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
@@ -73,8 +73,11 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr
}
BufferWithExtendableBuffer headerBuffer(
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+ MutableEntryCounters entryCounters;
+ entryCounters.setNgramCount(NgramType::Unigram, unigramCount);
+ entryCounters.setNgramCount(NgramType::Bigram, bigramCount);
if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
- unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) {
+ entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) {
return false;
}
return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
@@ -106,7 +109,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
}
const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
.getValidUnigramCount();
- const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
+ const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram);
if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
@@ -123,7 +126,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
return false;
}
const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
- const int maxBigramCount = headerPolicy->getMaxBigramCount();
+ const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram);
if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
if (!truncateBigrams(maxBigramCount)) {
AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
@@ -216,7 +219,7 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
probabilityEntry.getProbability();
priorityQueue.push(DictProbability(terminalPos, probability,
- probabilityEntry.getHistoricalInfo()->getTimeStamp()));
+ probabilityEntry.getHistoricalInfo()->getTimestamp()));
}
// Delete unigrams.
@@ -263,7 +266,7 @@ bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
bigramEntry.getProbability();
priorityQueue.push(DictProbability(entryPos, probability,
- bigramEntry.getHistoricalInfo()->getTimeStamp()));
+ bigramEntry.getHistoricalInfo()->getTimestamp()));
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
index 9034ee656..f2b873826 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h
@@ -18,15 +18,16 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
+ * dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H
#define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h"
+#include "dictionary/utils/entry_counters.h"
namespace latinime {
namespace backward {
@@ -46,8 +47,7 @@ class Ver4PatriciaTrieWritingHelper {
Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
: mBuffers(buffers) {}
- bool writeToDictFile(const char *const dictDirPath, const int unigramCount,
- const int bigramCount) const;
+ bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const;
// This method cannot be const because the original dictionary buffer will be updated to detect
// useless PtNodes during GC.
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
index 537a6d420..d27d70816 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp
@@ -18,14 +18,14 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
+ * dictionary/structure/v4/ver4_pt_node_array_reader.cpp
*/
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
+#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
index 4f8056801..0039bf8fc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
+++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h
@@ -18,14 +18,14 @@
* !!!!! DO NOT EDIT THIS FILE !!!!!
*
* This file was generated from
- * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h
+ * dictionary/structure/v4/ver4_pt_node_array_reader.h
*/
#ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H
#define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
namespace latinime {
namespace backward {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
index e4ea3da16..4470e8568 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
+++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp
@@ -14,23 +14,23 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h"
+#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h"
#include <climits>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/backward/v402/ver4_dict_constants.h"
+#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/v2/patricia_trie_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
#include "utils/byte_array_view.h"
namespace latinime {
@@ -58,7 +58,7 @@ namespace latinime {
const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) {
FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion);
switch (dictFormatVersion) {
- case FormatUtils::VERSION_4: {
+ case FormatUtils::VERSION_402: {
return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants,
backward::v402::Ver4DictBuffers,
backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
@@ -66,7 +66,7 @@ namespace latinime {
dictFormatVersion, locale, attributeMap);
}
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4_DEV: {
+ case FormatUtils::VERSION_403: {
return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers,
Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
dictFormatVersion, locale, attributeMap);
@@ -111,13 +111,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
return nullptr;
}
const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion(
- mmappedBuffer->getReadOnlyByteArrayView().data(),
- mmappedBuffer->getReadOnlyByteArrayView().size());
+ mmappedBuffer->getReadOnlyByteArrayView());
switch (formatVersion) {
case FormatUtils::VERSION_2:
- AKLOGE("Given path is a directory but the format is version 2. path: %s", path);
+ case FormatUtils::VERSION_201:
+ case FormatUtils::VERSION_202:
+ AKLOGE("Given path is a directory but the format is version 2xx. path: %s", path);
break;
- case FormatUtils::VERSION_4: {
+ case FormatUtils::VERSION_402: {
return newPolicyForV4Dict<backward::v402::Ver4DictConstants,
backward::v402::Ver4DictBuffers,
backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr,
@@ -125,7 +126,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
headerFilePath, formatVersion, std::move(mmappedBuffer));
}
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4_DEV: {
+ case FormatUtils::VERSION_403: {
return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers,
Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>(
headerFilePath, formatVersion, std::move(mmappedBuffer));
@@ -174,14 +175,17 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str
if (!mmappedBuffer) {
return nullptr;
}
- switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView().data(),
- mmappedBuffer->getReadOnlyByteArrayView().size())) {
+ switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) {
case FormatUtils::VERSION_2:
+ case FormatUtils::VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ break;
+ case FormatUtils::VERSION_202:
return DictionaryStructureWithBufferPolicy::StructurePolicyPtr(
new PatriciaTriePolicy(std::move(mmappedBuffer)));
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4:
- case FormatUtils::VERSION_4_DEV:
+ case FormatUtils::VERSION_402:
+ case FormatUtils::VERSION_403:
AKLOGE("Given path is a file but the format is version 4. path: %s", path);
break;
default:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
index 768454d8d..b0c04c0b1 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
+++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h
@@ -20,10 +20,10 @@
#include <vector>
#include "defines.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
index f7fd5c071..64f9b6663 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp
@@ -14,10 +14,10 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
@@ -39,32 +39,31 @@ const BigramListReadWriteUtils::BigramFlags
BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F;
+// Reads the bigram entry located at *bigramEntryPos: first the flags byte, then the target
+// address that those flags describe. *bigramEntryPos is passed on to both readers so that it is
+// moved past the data they consume.
+//
+// outBigramFlags and outTargetPtNodePos are optional; a null pointer means the caller does not
+// need that value.
+//
+// Returns false, after logging and without reading anything, when *bigramEntryPos is at or
+// beyond the end of the buffer. Returns true otherwise.
/* static */ bool BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(
- const uint8_t *const bigramsBuf, const int bufSize, BigramFlags *const outBigramFlags,
+ const ReadOnlyByteArrayView buffer, BigramFlags *const outBigramFlags,
int *const outTargetPtNodePos, int *const bigramEntryPos) {
- if (bufSize <= *bigramEntryPos) {
- AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). bufSize: %d, "
- "bigramEntryPos: %d.", bufSize, *bigramEntryPos);
+ if (static_cast<int>(buffer.size()) <= *bigramEntryPos) {
+ // The view's size is an unsigned size value, so it is printed with %zu; %zd is the signed
+ // conversion and does not match the argument type.
+ AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). bufSize: %zu, "
+ "bigramEntryPos: %d.", buffer.size(), *bigramEntryPos);
return false;
}
- const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf,
+ const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(),
bigramEntryPos);
if (outBigramFlags) {
*outBigramFlags = bigramFlags;
}
- const int targetPos = getBigramAddressAndAdvancePosition(bigramsBuf, bigramFlags,
- bigramEntryPos);
+ const int targetPos = getBigramAddressAndAdvancePosition(buffer, bigramFlags, bigramEntryPos);
if (outTargetPtNodePos) {
*outTargetPtNodePos = targetPos;
}
return true;
}
-/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const uint8_t *const bigramsBuf,
- const int bufSize, int *const bigramListPos) {
+/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const ReadOnlyByteArrayView buffer,
+ int *const bigramListPos) {
BigramFlags flags;
do {
- if (!getBigramEntryPropertiesAndAdvancePosition(bigramsBuf, bufSize, &flags,
- 0 /* outTargetPtNodePos */, bigramListPos)) {
+ if (!getBigramEntryPropertiesAndAdvancePosition(buffer, &flags, 0 /* outTargetPtNodePos */,
+ bigramListPos)) {
return false;
}
} while(hasNext(flags));
@@ -72,18 +71,18 @@ const BigramListReadWriteUtils::BigramFlags
}
/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition(
- const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) {
+ const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos) {
int offset = 0;
const int origin = *pos;
switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) {
case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
- offset = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos);
+ offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos);
break;
case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
- offset = ByteArrayUtils::readUint16AndAdvancePosition(bigramsBuf, pos);
+ offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos);
break;
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
- offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos);
+ offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer.data(), pos);
break;
}
if (isOffsetNegative(flags)) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
index 10f93fb7a..a0f7d5e83 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
+++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h
@@ -21,6 +21,7 @@
#include <cstdlib>
#include "defines.h"
+#include "utils/byte_array_view.h"
namespace latinime {
@@ -30,8 +31,8 @@ class BigramListReadWriteUtils {
public:
typedef uint8_t BigramFlags;
- static bool getBigramEntryPropertiesAndAdvancePosition(const uint8_t *const bigramsBuf,
- const int bufSize, BigramFlags *const outBigramFlags, int *const outTargetPtNodePos,
+ static bool getBigramEntryPropertiesAndAdvancePosition(const ReadOnlyByteArrayView buffer,
+ BigramFlags *const outBigramFlags, int *const outTargetPtNodePos,
int *const bigramEntryPos);
static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) {
@@ -43,8 +44,7 @@ public:
}
// Bigrams reading methods
- static bool skipExistingBigrams(const uint8_t *const bigramsBuf, const int bufSize,
- int *const bigramListPos);
+ static bool skipExistingBigrams(const ReadOnlyByteArrayView buffer, int *const bigramListPos);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils);
@@ -61,7 +61,7 @@ private:
return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0;
}
- static int getBigramAddressAndAdvancePosition(const uint8_t *const bigramsBuf,
+ static int getBigramAddressAndAdvancePosition(const ReadOnlyByteArrayView buffer,
const BigramFlags flags, int *const pos);
};
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
index db1a802d0..b5e2e9dae 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp
@@ -14,12 +14,12 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
index 2aa402748..8c7ad965b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h
@@ -20,9 +20,9 @@
#include <vector>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
@@ -76,6 +76,7 @@ class DynamicPtGcEventListeners {
int mValidUnigramCount;
};
+ // TODO: Remove when we stop supporting v402 format.
// Updates all bigram entries that are held by valid PtNodes. This removes useless bigram
// entries.
class TraversePolicyToUpdateBigramProbability
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
index 086d98b4a..294bc6ea9 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
#include "utils/char_utils.h"
namespace latinime {
@@ -175,8 +175,8 @@ bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFi
return !isError();
}
-int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount(
- const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) {
+int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount,
+ int *const outCodePoints) {
// This method traverses parent nodes from the terminal by following parent pointers; thus,
// node code points are stored in the buffer in the reverse order.
int reverseCodePoints[maxCodePointCount];
@@ -184,11 +184,8 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount(
// First, read the terminal node and get its probability.
if (!isValidTerminalNode(terminalPtNodeParams)) {
// Node at the ptNodePos is not a valid terminal node.
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
- // Store terminal node probability.
- *outUnigramProbability = terminalPtNodeParams.getProbability();
// Then, following parent node link to the dictionary root and fetch node code points.
int totalCodePointCount = 0;
while (!isEnd()) {
@@ -196,7 +193,6 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount(
totalCodePointCount = getTotalCodePointCount(ptNodeParams);
if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) {
// The ptNodePos is not a valid terminal node position in the dictionary.
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Store node code points to buffer in the reverse order.
@@ -207,7 +203,6 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount(
}
if (isError()) {
// The node position or the dictionary is invalid.
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
// Reverse the stored code points to output them.
@@ -218,9 +213,9 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount(
}
int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) {
+ const size_t length, const bool forceLowerCaseSearch) {
int searchCodePoints[length];
- for (int i = 0; i < length; ++i) {
+ for (size_t i = 0; i < length; ++i) {
searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i];
}
while (!isEnd()) {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
index b7262581a..d8ddc7c2b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h
@@ -21,8 +21,8 @@
#include <vector>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
namespace latinime {
@@ -138,12 +138,12 @@ class DynamicPtReadingHelper {
}
// Return code point count exclude the last read node's code points.
- AK_FORCE_INLINE int getPrevTotalCodePointCount() const {
+ AK_FORCE_INLINE size_t getPrevTotalCodePointCount() const {
return mReadingState.mTotalCodePointCountSinceInitialization;
}
// Return code point count include the last read node's code points.
- AK_FORCE_INLINE int getTotalCodePointCount(const PtNodeParams &ptNodeParams) const {
+ AK_FORCE_INLINE size_t getTotalCodePointCount(const PtNodeParams &ptNodeParams) const {
return mReadingState.mTotalCodePointCountSinceInitialization
+ ptNodeParams.getCodePointCount();
}
@@ -211,10 +211,9 @@ class DynamicPtReadingHelper {
bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
TraversingEventListener *const listener);
- int getCodePointsAndProbabilityAndReturnCodePointCount(const int maxCodePointCount,
- int *const outCodePoints, int *const outUnigramProbability);
+ int getCodePointsAndReturnCodePointCount(const int maxCodePointCount, int *const outCodePoints);
- int getTerminalPtNodePositionOfWord(const int *const inWord, const int length,
+ int getTerminalPtNodePositionOfWord(const int *const inWord, const size_t length,
const bool forceLowerCaseSearch);
private:
@@ -234,7 +233,7 @@ class DynamicPtReadingHelper {
int mPos;
// Remaining node count in the current array.
int mRemainingPtNodeCountInThisArray;
- int mTotalCodePointCountSinceInitialization;
+ size_t mTotalCodePointCountSinceInitialization;
// Counter of PtNodes used to avoid infinite loops caused by broken or malicious links.
int mTotalPtNodeIndexInThisArrayChain;
// Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
index 3586b50ab..3eb55ed9b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp
@@ -14,10 +14,10 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
#include "defines.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
index b13a075d5..b13a075d5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
index 3c62e2e56..ccad345c8 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp
@@ -14,31 +14,30 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
-#include "suggest/core/dictionary/property/unigram_property.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3;
-bool DynamicPtUpdatingHelper::addUnigramWord(
- DynamicPtReadingHelper *const readingHelper,
- const int *const wordCodePoints, const int codePointCount,
- const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) {
+bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper,
+ const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
+ bool *const outAddedNewUnigram) {
int parentPos = NOT_A_DICT_POS;
while (!readingHelper->isEnd()) {
const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams());
if (!ptNodeParams.isValid()) {
break;
}
- const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
+ const size_t matchedCodePointCount = readingHelper->getPrevTotalCodePointCount();
if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */,
wordCodePoints[matchedCodePointCount])) {
// The first code point is different from target code point. Skip this node and read
@@ -47,26 +46,25 @@ bool DynamicPtUpdatingHelper::addUnigramWord(
continue;
}
// Check following merged node code points.
- const int nodeCodePointCount = ptNodeParams.getCodePointCount();
- for (int j = 1; j < nodeCodePointCount; ++j) {
- const int nextIndex = matchedCodePointCount + j;
- if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
- wordCodePoints[matchedCodePointCount + j])) {
+ const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size();
+ for (size_t j = 1; j < nodeCodePointCount; ++j) {
+ const size_t nextIndex = matchedCodePointCount + j;
+ if (nextIndex >= wordCodePoints.size()
+ || !readingHelper->isMatchedCodePoint(ptNodeParams, j,
+ wordCodePoints[matchedCodePointCount + j])) {
*outAddedNewUnigram = true;
return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty,
- wordCodePoints + matchedCodePointCount,
- codePointCount - matchedCodePointCount);
+ wordCodePoints.skip(matchedCodePointCount));
}
}
// All characters are matched.
- if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) {
+ if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) {
return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram);
}
if (!ptNodeParams.hasChildren()) {
*outAddedNewUnigram = true;
return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty,
- wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams),
- codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams));
+ wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams)));
}
// Advance to the children nodes.
parentPos = ptNodeParams.getHeadPos();
@@ -79,13 +77,12 @@ bool DynamicPtUpdatingHelper::addUnigramWord(
int pos = readingHelper->getPosOfLastForwardLinkField();
*outAddedNewUnigram = true;
return createAndInsertNodeIntoPtNodeArray(parentPos,
- wordCodePoints + readingHelper->getPrevTotalCodePointCount(),
- codePointCount - readingHelper->getPrevTotalCodePointCount(),
- unigramProperty, &pos);
+ wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty,
+ &pos);
}
bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
- const int wordPos, const BigramProperty *const bigramProperty,
+ const int wordPos, const NgramProperty *const ngramProperty,
bool *const outAddedNewEntry) {
if (prevWordsPtNodePos.empty()) {
return false;
@@ -99,7 +96,7 @@ bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPt
const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size());
const int wordId =
mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId();
- return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, outAddedNewEntry);
+ return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry);
}
bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos,
@@ -120,23 +117,21 @@ bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWord
}
bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos,
- const int *const targetCodePoints, const int targetCodePointCount,
- const int shortcutProbability) {
+ const CodePointArrayView targetCodePoints, const int shortcutProbability) {
const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos));
- return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount,
- shortcutProbability);
+ return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints.data(),
+ targetCodePoints.size(), shortcutProbability);
}
bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos,
- const int *const nodeCodePoints, const int nodeCodePointCount,
- const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) {
+ const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
+ int *const forwardLinkFieldPos) {
const int newPtNodeArrayPos = mBuffer->getTailPosition();
if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer,
newPtNodeArrayPos, forwardLinkFieldPos)) {
return false;
}
- return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount,
- unigramProperty);
+ return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty);
}
bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
@@ -151,10 +146,9 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const ori
const int movedPos = mBuffer->getTailPosition();
int writingPos = movedPos;
const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams,
- unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
true /* isTerminal */, originalPtNodeParams->getParentPos(),
- originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(),
- unigramProperty->getProbability()));
+ originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
unigramProperty, &writingPos)) {
return false;
@@ -168,17 +162,17 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const ori
bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode(
const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const unigramProperty,
- const int *const codePoints, const int codePointCount) {
+ const CodePointArrayView codePoints) {
const int newPtNodeArrayPos = mBuffer->getTailPosition();
if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) {
return false;
}
return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints,
- codePointCount, unigramProperty);
+ unigramProperty);
}
bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
- const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount,
+ const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints,
const UnigramProperty *const unigramProperty) {
int writingPos = mBuffer->getTailPosition();
if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer,
@@ -186,8 +180,8 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
return false;
}
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
- unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */,
- parentPtNodePos, nodeCodePointCount, nodeCodePoints,
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
+ true /* isTerminal */, parentPtNodePos, ptNodeCodePoints,
unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
unigramProperty, &writingPos)) {
@@ -202,9 +196,9 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode(
// Returns whether the dictionary updating was succeeded or not.
bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
- const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount,
- const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints,
- const int newNodeCodePointCount) {
+ const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount,
+ const UnigramProperty *const unigramProperty,
+ const CodePointArrayView newPtNodeCodePoints) {
// When addsExtraChild is true, split the reallocating PtNode and add new child.
// Reallocating PtNode: abcde, newNode: abcxy.
// abc (1st, not terminal) __ de (2nd)
@@ -212,25 +206,26 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
// Otherwise, this method makes 1st part terminal and write information in unigramProperty.
// Reallocating PtNode: abcde, newNode: abc.
// abc (1st, terminal) __ de (2nd)
- const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount;
+ const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount;
const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition();
int writingPos = firstPartOfReallocatedPtNodePos;
// Write the 1st part of the reallocating node. The children position will be updated later
// with actual children position.
+ const CodePointArrayView firstPtNodeCodePoints =
+ reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount);
if (addsExtraChild) {
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
- false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */,
- reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount,
- reallocatingPtNodeParams->getCodePoints(), NOT_A_PROBABILITY));
+ false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */,
+ reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints,
+ NOT_A_PROBABILITY));
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) {
return false;
}
} else {
const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode(
- unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
true /* isTerminal */, reallocatingPtNodeParams->getParentPos(),
- overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(),
- unigramProperty->getProbability()));
+ firstPtNodeCodePoints, unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite,
unigramProperty, &writingPos)) {
return false;
@@ -246,20 +241,19 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
// Write the 2nd part of the reallocating node.
const int secondPartOfReallocatedPtNodePos = writingPos;
const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams,
- reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(),
+ reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(),
reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos,
- reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount,
- reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount,
+ reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount),
reallocatingPtNodeParams->getProbability()));
if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) {
return false;
}
if (addsExtraChild) {
const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode(
- unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
+ unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(),
true /* isTerminal */, firstPartOfReallocatedPtNodePos,
- newNodeCodePointCount - overlappingCodePointCount,
- newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability()));
+ newPtNodeCodePoints.skip(overlappingCodePointCount),
+ unigramProperty->getProbability()));
if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams,
unigramProperty, &writingPos)) {
return false;
@@ -282,26 +276,24 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes(
}
const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams(
- const PtNodeParams *const originalPtNodeParams,
- const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos,
- const int codePointCount, const int *const codePoints, const int probability) const {
+ const PtNodeParams *const originalPtNodeParams, const bool isNotAWord,
+ const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
+ const CodePointArrayView codePoints, const int probability) const {
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
- isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
- false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */,
+ isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
+ false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
CHILDREN_POSITION_FIELD_SIZE);
- return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints,
- probability);
+ return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability);
}
-const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(
- const bool isNotAWord, const bool isBlacklisted, const bool isTerminal,
- const int parentPos, const int codePointCount, const int *const codePoints,
- const int probability) const {
+const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord,
+ const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
+ const CodePointArrayView codePoints, const int probability) const {
const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags(
- isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */,
- false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */,
+ isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */,
+ false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */,
CHILDREN_POSITION_FIELD_SIZE);
- return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability);
+ return PtNodeParams(flags, parentPos, codePoints, probability);
}
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h
index 97c05c1ea..e8cf98c39 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h
@@ -18,12 +18,12 @@
#define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
#include "utils/int_array_view.h"
namespace latinime {
-class BigramProperty;
+class NgramProperty;
class BufferWithExtendableBuffer;
class DynamicPtReadingHelper;
class PtNodeReader;
@@ -40,19 +40,21 @@ class DynamicPtUpdatingHelper {
// Add a word to the dictionary. If the word already exists, update the probability.
bool addUnigramWord(DynamicPtReadingHelper *const readingHelper,
- const int *const wordCodePoints, const int codePointCount,
- const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram);
+ const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty,
+ bool *const outAddedNewUnigram);
+ // TODO: Remove after stopping supporting v402.
// Add an n-gram entry.
bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
+ // TODO: Remove after stopping supporting v402.
// Remove an n-gram entry.
bool removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos);
// Add a shortcut target.
- bool addShortcutTarget(const int wordPos, const int *const targetCodePoints,
- const int targetCodePointCount, const int shortcutProbability);
+ bool addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints,
+ const int shortcutProbability);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper);
@@ -63,33 +65,32 @@ class DynamicPtUpdatingHelper {
const PtNodeReader *const mPtNodeReader;
PtNodeWriter *const mPtNodeWriter;
- bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints,
- const int nodeCodePointCount, const UnigramProperty *const unigramProperty,
+ bool createAndInsertNodeIntoPtNodeArray(const int parentPos,
+ const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty,
int *const forwardLinkFieldPos);
bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams,
const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram);
bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams,
- const UnigramProperty *const unigramProperty, const int *const codePoints,
- const int codePointCount);
+ const UnigramProperty *const unigramProperty,
+ const CodePointArrayView remainingCodePoints);
- bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints,
- const int nodeCodePointCount, const UnigramProperty *const unigramProperty);
+ bool createNewPtNodeArrayWithAChildPtNode(const int parentPos,
+ const CodePointArrayView ptNodeCodePoints,
+ const UnigramProperty *const unigramProperty);
- bool reallocatePtNodeAndAddNewPtNodes(
- const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount,
- const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints,
- const int newNodeCodePointCount);
+ bool reallocatePtNodeAndAddNewPtNodes(const PtNodeParams *const reallocatingPtNodeParams,
+ const size_t overlappingCodePointCount, const UnigramProperty *const unigramProperty,
+ const CodePointArrayView newPtNodeCodePoints);
const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams,
- const bool isNotAWord, const bool isBlacklisted, const bool isTerminal,
- const int parentPos, const int codePointCount,
- const int *const codePoints, const int probability) const;
+ const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal,
+ const int parentPos, const CodePointArrayView codePoints, const int probability) const;
- const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, const bool isBlacklisted,
- const bool isTerminal, const int parentPos,
- const int codePointCount, const int *const codePoints, const int probability) const;
+ const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord,
+ const bool isPossiblyOffensive, const bool isTerminal, const int parentPos,
+ const CodePointArrayView codePoints, const int probability) const;
};
} // namespace latinime
#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp
index 664aeebbb..ea760a538 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp
@@ -14,13 +14,13 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
#include <cstddef>
#include <cstdint>
#include <cstdlib>
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
index 362fbd1cc..b4817af41 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
+++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h
@@ -20,7 +20,7 @@
#include <cstddef>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
index e64a13cc4..e2807c492 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp
@@ -14,12 +14,12 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
#include "defines.h"
-#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
-#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
@@ -41,8 +41,8 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04;
// Flag for non-words (typically, shortcut only entries)
const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02;
-// Flag for blacklist
-const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
+// Flag for possibly offensive words
+const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01;
/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition(
const uint8_t *const buffer, int *const pos) {
@@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
}
/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer,
- int *const pos) {
- return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos);
+ const int *const codePointTable, int *const pos) {
+ return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos);
}
// Returns the number of read characters.
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer,
- const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) {
+ const NodeFlags flags, const int maxLength, const int *const codePointTable,
+ int *const outBuffer, int *const pos) {
int length = 0;
if (hasMultipleChars(flags)) {
- length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer,
- pos);
+ length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable,
+ outBuffer, pos);
} else {
- const int codePoint = getCodePointAndAdvancePosition(buffer, pos);
+ const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos);
if (codePoint == NOT_A_CODE_POINT) {
// CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is
// CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR
@@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
// Returns the number of skipped characters.
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
- const int maxLength, int *const pos) {
+ const int maxLength, const int *const codePointTable, int *const pos) {
if (hasMultipleChars(flags)) {
return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos);
} else {
if (maxLength > 0) {
- getCodePointAndAdvancePosition(buffer, pos);
+ getCodePointAndAdvancePosition(buffer, codePointTable, pos);
return 1;
} else {
return 0;
@@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
/* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
- const DictionaryBigramsStructurePolicy *const bigramPolicy,
+ const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable,
NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
int *const outBigramPos, int *const outSiblingPos) {
@@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01;
const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos);
*outFlags = flags;
*outCodePointCount = getCharsAndAdvancePosition(
- dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos);
+ dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos);
*outProbability = isTerminal(flags) ?
readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY;
*outChildrenPos = hasChildrenInFlags(flags) ?
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h
index c3f09c3b1..6a2bf5d3c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h
+++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h
@@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils {
static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos);
- static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos);
+ static int getCodePointAndAdvancePosition(const uint8_t *const buffer,
+ const int *const codePointTable, int *const pos);
// Returns the number of read characters.
static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags,
- const int maxLength, int *const outBuffer, int *const pos);
+ const int maxLength, const int *const codePointTable, int *const outBuffer,
+ int *const pos);
// Returns the number of skipped characters.
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags,
- const int maxLength, int *const pos);
+ const int maxLength, const int *const codePointTable, int *const pos);
static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos);
@@ -52,8 +54,8 @@ class PatriciaTrieReadingUtils {
/**
* Node Flags
*/
- static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) {
- return (flags & FLAG_IS_BLACKLISTED) != 0;
+ static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) {
+ return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0;
}
static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) {
@@ -80,12 +82,12 @@ class PatriciaTrieReadingUtils {
return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags);
}
- static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isBlacklisted,
+ static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive,
const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets,
const bool hasBigrams, const bool hasMultipleChars,
const int childrenPositionFieldSize) {
NodeFlags nodeFlags = 0;
- nodeFlags = isBlacklisted ? (nodeFlags | FLAG_IS_BLACKLISTED) : nodeFlags;
+ nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags;
nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags;
nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags;
nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags;
@@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils {
static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos,
const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
- NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint,
- int *const outProbability, int *const outChildrenPos, int *const outShortcutPos,
- int *const outBigramPos, int *const outSiblingPos);
+ const int *const codePointTable, NodeFlags *const outFlags,
+ int *const outCodePointCount, int *const outCodePoint, int *const outProbability,
+ int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos,
+ int *const outSiblingPos);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils);
@@ -124,7 +127,7 @@ class PatriciaTrieReadingUtils {
static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS;
static const NodeFlags FLAG_HAS_BIGRAMS;
static const NodeFlags FLAG_IS_NOT_A_WORD;
- static const NodeFlags FLAG_IS_BLACKLISTED;
+ static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE;
};
} // namespace latinime
#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h
index 6078d8285..6078d8285 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h
index b2e60a837..905deb1bc 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h
@@ -20,10 +20,11 @@
#include <cstring>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
#include "utils/char_utils.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -88,9 +89,9 @@ class PtNodeParams {
// Construct new params by updating existing PtNode params.
PtNodeParams(const PtNodeParams *const ptNodeParams,
const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
- const int codePointCount, const int *const codePoints, const int probability)
+ const CodePointArrayView codePoints, const int probability)
: mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true),
- mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(),
+ mParentPos(parentPos), mCodePointCount(codePoints.size()), mCodePoints(),
mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()),
mTerminalId(ptNodeParams->getTerminalId()),
mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()),
@@ -101,20 +102,20 @@ class PtNodeParams {
mShortcutPos(ptNodeParams->getShortcutPos()),
mBigramPos(ptNodeParams->getBigramsPos()),
mSiblingPos(ptNodeParams->getSiblingNodePos()) {
- memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);
+ memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount);
}
PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos,
- const int codePointCount, const int *const codePoints, const int probability)
+ const CodePointArrayView codePoints, const int probability)
: mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos),
- mCodePointCount(codePointCount), mCodePoints(),
+ mCodePointCount(codePoints.size()), mCodePoints(),
mTerminalIdFieldPos(NOT_A_DICT_POS),
mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID),
mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability),
mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS),
mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS),
mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) {
- memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount);
+ memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount);
}
AK_FORCE_INLINE bool isValid() const {
@@ -143,8 +144,8 @@ class PtNodeParams {
return PatriciaTrieReadingUtils::isTerminal(mFlags);
}
- AK_FORCE_INLINE bool isBlacklisted() const {
- return PatriciaTrieReadingUtils::isBlacklisted(mFlags);
+ AK_FORCE_INLINE bool isPossiblyOffensive() const {
+ return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags);
}
AK_FORCE_INLINE bool isNotAWord() const {
@@ -174,11 +175,17 @@ class PtNodeParams {
return mParentPos;
}
+ AK_FORCE_INLINE const CodePointArrayView getCodePointArrayView() const {
+ return CodePointArrayView(mCodePoints, mCodePointCount);
+ }
+
+ // TODO: Remove
// Number of code points
AK_FORCE_INLINE uint8_t getCodePointCount() const {
return mCodePointCount;
}
+ // TODO: Remove
AK_FORCE_INLINE const int *getCodePoints() const {
return mCodePoints;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h
index 31299a707..15da19e0b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h
@@ -19,7 +19,7 @@
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h
index 955d779ac..e6cad25aa 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h
+++ b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h
@@ -20,12 +20,12 @@
#include <unordered_map>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
#include "utils/int_array_view.h"
namespace latinime {
-class BigramProperty;
+class NgramProperty;
class UnigramProperty;
// Interface class used to write PtNode information.
@@ -72,7 +72,7 @@ class PtNodeWriter {
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0;
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) = 0;
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0;
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
index 91c76941c..14428edd4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
+++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
@@ -31,21 +31,23 @@ const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2;
const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15;
/* static */ ShortcutListReadingUtils::ShortcutFlags
- ShortcutListReadingUtils::getFlagsAndForwardPointer(const uint8_t *const dictRoot,
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer,
int *const pos) {
- return ByteArrayUtils::readUint8AndAdvancePosition(dictRoot, pos);
+ return ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos);
}
/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(
- const uint8_t *const dictRoot, int *const pos) {
+ const ReadOnlyByteArrayView buffer, int *const pos) {
// readUint16andAdvancePosition() returns an offset *including* the uint16 field itself.
- return ByteArrayUtils::readUint16AndAdvancePosition(dictRoot, pos)
+ return ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos)
- SHORTCUT_LIST_SIZE_FIELD_SIZE;
}
-/* static */ int ShortcutListReadingUtils::readShortcutTarget(
- const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) {
- return ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos);
+/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer,
+ const int maxLength, int *const outWord, int *const pos) {
+ // TODO: Use codePointTable for shortcuts.
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength,
+ nullptr /* codePointTable */, outWord, pos);
}
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h
index d065bf7fd..71cb8cc2c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h
+++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h
@@ -20,6 +20,7 @@
#include <cstdint>
#include "defines.h"
+#include "utils/byte_array_view.h"
namespace latinime {
@@ -27,7 +28,8 @@ class ShortcutListReadingUtils {
public:
typedef uint8_t ShortcutFlags;
- static ShortcutFlags getFlagsAndForwardPointer(const uint8_t *const dictRoot, int *const pos);
+ static ShortcutFlags getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer,
+ int *const pos);
static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) {
return flags & MASK_ATTRIBUTE_PROBABILITY;
@@ -39,14 +41,15 @@ class ShortcutListReadingUtils {
// This method returns the size of the shortcut list region excluding the shortcut list size
// field at the beginning.
- static int getShortcutListSizeAndForwardPointer(const uint8_t *const dictRoot, int *const pos);
+ static int getShortcutListSizeAndForwardPointer(const ReadOnlyByteArrayView buffer,
+ int *const pos);
static AK_FORCE_INLINE int getShortcutListSizeFieldSize() {
return SHORTCUT_LIST_SIZE_FIELD_SIZE;
}
- static AK_FORCE_INLINE void skipShortcuts(const uint8_t *const dictRoot, int *const pos) {
- const int shortcutListSize = getShortcutListSizeAndForwardPointer(dictRoot, pos);
+ static AK_FORCE_INLINE void skipShortcuts(const ReadOnlyByteArrayView buffer, int *const pos) {
+ const int shortcutListSize = getShortcutListSizeAndForwardPointer(buffer, pos);
*pos += shortcutListSize;
}
@@ -54,7 +57,7 @@ class ShortcutListReadingUtils {
return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY;
}
- static int readShortcutTarget(const uint8_t *const dictRoot, const int maxLength,
+ static int readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength,
int *const outWord, int *const pos);
private:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h
index 73e291ec2..25081fa04 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h
+++ b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h
@@ -20,24 +20,24 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
+#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
+#include "utils/byte_array_view.h"
namespace latinime {
class BigramListPolicy : public DictionaryBigramsStructurePolicy {
public:
- BigramListPolicy(const uint8_t *const bigramsBuf, const int bufSize)
- : mBigramsBuf(bigramsBuf), mBufSize(bufSize) {}
+ BigramListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}
~BigramListPolicy() {}
void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext,
int *const pos) const {
BigramListReadWriteUtils::BigramFlags flags;
- if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBigramsBuf,
- mBufSize, &flags, outBigramPos, pos)) {
- AKLOGE("Cannot read bigram entry. mBufSize: %d, pos: %d. ", mBufSize, *pos);
+ if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBuffer, &flags,
+ outBigramPos, pos)) {
+ AKLOGE("Cannot read bigram entry. bufSize: %zd, pos: %d. ", mBuffer.size(), *pos);
*outProbability = NOT_A_PROBABILITY;
*outHasNext = false;
return;
@@ -47,14 +47,13 @@ class BigramListPolicy : public DictionaryBigramsStructurePolicy {
}
bool skipAllBigrams(int *const pos) const {
- return BigramListReadWriteUtils::skipExistingBigrams(mBigramsBuf, mBufSize, pos);
+ return BigramListReadWriteUtils::skipExistingBigrams(mBuffer, pos);
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy);
- const uint8_t *const mBigramsBuf;
- const int mBufSize;
+ const ReadOnlyByteArrayView mBuffer;
};
} // namespace latinime
#endif // LATINIME_BIGRAM_LIST_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp
index ea32eb2a9..4e8b96b08 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp
+++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp
@@ -14,18 +14,18 @@
* limitations under the License.
*/
-
-#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h"
+#include "dictionary/structure/v2/patricia_trie_policy.h"
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
-#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
-#include "suggest/core/dictionary/ngram_listener.h"
-#include "suggest/core/session/prev_words_info.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
#include "utils/char_utils.h"
namespace latinime {
@@ -36,19 +36,19 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
return;
}
int nextPos = dicNode->getChildrenPtNodeArrayPos();
- if (nextPos < 0 || nextPos >= mDictBufferSize) {
- AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d",
- nextPos, mDictBufferSize);
+ if (!isValidPos(nextPos)) {
+ AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd",
+ nextPos, mBuffer.size());
mIsCorrupted = true;
ASSERT(false);
return;
}
const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
- mDictRoot, &nextPos);
+ mBuffer.data(), &nextPos);
for (int i = 0; i < childCount; i++) {
- if (nextPos < 0 || nextPos >= mDictBufferSize) {
- AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d",
- nextPos, mDictBufferSize, i, childCount);
+ if (!isValidPos(nextPos)) {
+ AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d",
+ nextPos, mBuffer.size(), i, childCount);
mIsCorrupted = true;
ASSERT(false);
return;
@@ -57,7 +57,12 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
}
}
-// This retrieves code points and the probability of the word by its terminal position.
+int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints) const {
+ return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount,
+ outCodePoints, nullptr /* outUnigramProbability */);
+}
+// This retrieves code points and the probability of the word by its id.
// Due to the fact that words are ordered in the dictionary in a strict breadth-first order,
// it is possible to check for this with advantageous complexity. For each PtNode array, we search
// for PtNodes with children and compare the children position with the position we look for.
@@ -68,18 +73,22 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo
// with a z, it's the last PtNode of the root array, so all children addresses will be smaller
// than the position we look for, and we have to descend the z PtNode).
/* Parameters :
- * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is
- * what is stored as the "bigram position" in each bigram)
+ * wordId: Id of the word we are searching for.
* outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size.
* outUnigramProbability: a pointer to an int to write the probability into.
* Return value : the code point count, of 0 if the word was not found.
*/
// TODO: Split this function to be more readable
int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
- const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
+ const int wordId, const int maxCodePointCount, int *const outCodePoints,
int *const outUnigramProbability) const {
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
int pos = getRootPosition();
int wordPos = 0;
+ const int *const codePointTable = mHeaderPolicy.getCodePointTable();
+ if (outUnigramProbability) {
+ *outUnigramProbability = NOT_A_PROBABILITY;
+ }
// One iteration of the outer loop iterates through PtNode arrays. As stated above, we will
// only traverse PtNodes that are actually a part of the terminal we are searching, so each
// time we enter this loop we are one depth level further than last time.
@@ -90,56 +99,57 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
int lastCandidatePtNodePos = 0;
// Let's loop through PtNodes in this PtNode array searching for either the terminal
// or one of its ascendants.
- if (pos < 0 || pos >= mDictBufferSize) {
- AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d",
- pos, mDictBufferSize);
+ if (!isValidPos(pos)) {
+ AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd",
+ pos, mBuffer.size());
mIsCorrupted = true;
ASSERT(false);
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
- mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) {
+ mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) {
const int startPos = pos;
- if (pos < 0 || pos >= mDictBufferSize) {
- AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos, mDictBufferSize);
+ if (!isValidPos(pos)) {
+ AKLOGE("PtNode position is invalid. pos: %d, dict size: %zd", pos, mBuffer.size());
mIsCorrupted = true;
ASSERT(false);
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
const PatriciaTrieReadingUtils::NodeFlags flags =
- PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos);
+ PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos);
const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mDictRoot, &pos);
+ mBuffer.data(), codePointTable, &pos);
if (ptNodePos == startPos) {
// We found the position. Copy the rest of the code points in the buffer and return
// the length.
outCodePoints[wordPos] = character;
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mDictRoot, &pos);
+ mBuffer.data(), codePointTable, &pos);
// We count code points in order to avoid infinite loops if the file is broken
// or if there is some other bug
int charCount = maxCodePointCount;
while (NOT_A_CODE_POINT != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mDictRoot, &pos);
+ mBuffer.data(), codePointTable, &pos);
}
}
- *outUnigramProbability =
- PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot,
- &pos);
+ if (outUnigramProbability) {
+ *outUnigramProbability =
+ PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(
+ mBuffer.data(), &pos);
+ }
return ++wordPos;
}
// We need to skip past this PtNode, so skip any remaining code points after the
// first and possibly the probability.
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) {
- PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos);
+ PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH,
+ codePointTable, &pos);
}
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
- PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos);
+ PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos);
}
// The fact that this PtNode has children is very important. Since we already know
// that this PtNode does not match, if it has no children we know it is irrelevant
@@ -154,7 +164,8 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
int currentPos = pos;
// Here comes the tricky part. First, read the children position.
const int childrenPos = PatriciaTrieReadingUtils
- ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &currentPos);
+ ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags,
+ &currentPos);
if (childrenPos > ptNodePos) {
// If the children pos is greater than the position, it means the previous
// PtNode, whose position is stored in lastCandidatePtNodePos, was the right
@@ -184,30 +195,30 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
if (0 != lastCandidatePtNodePos) {
const PatriciaTrieReadingUtils::NodeFlags lastFlags =
PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(
- mDictRoot, &lastCandidatePtNodePos);
+ mBuffer.data(), &lastCandidatePtNodePos);
const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mDictRoot, &lastCandidatePtNodePos);
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
// We copy all the characters in this PtNode to the buffer
outCodePoints[wordPos] = lastChar;
if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) {
int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mDictRoot, &lastCandidatePtNodePos);
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
int charCount = maxCodePointCount;
while (-1 != nextChar && --charCount > 0) {
outCodePoints[++wordPos] = nextChar;
nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(
- mDictRoot, &lastCandidatePtNodePos);
+ mBuffer.data(), codePointTable, &lastCandidatePtNodePos);
}
}
++wordPos;
// Now we only need to branch to the children address. Skip the probability if
// it's there, read pos, and break to resume the search at pos.
if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) {
- PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot,
+ PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(),
&lastCandidatePtNodePos);
}
pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
- mDictRoot, lastFlags, &lastCandidatePtNodePos);
+ mBuffer.data(), lastFlags, &lastCandidatePtNodePos);
break;
} else {
// Here is a little tricky part: we come here if we found out that all children
@@ -219,18 +230,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// ready to start the next one.
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
- mDictRoot, flags, &pos);
+ mBuffer.data(), flags, &pos);
}
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
mShortcutListPolicy.skipAllShortcuts(&pos);
}
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
if (!mBigramListPolicy.skipAllBigrams(&pos)) {
- AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", mDictBufferSize,
+ AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(),
pos);
mIsCorrupted = true;
ASSERT(false);
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
}
@@ -243,17 +253,16 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
// our pos is after the end of this PtNode, at the start of the next one.
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) {
PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(
- mDictRoot, flags, &pos);
+ mBuffer.data(), flags, &pos);
}
if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) {
mShortcutListPolicy.skipAllShortcuts(&pos);
}
if (PatriciaTrieReadingUtils::hasBigrams(flags)) {
if (!mBigramListPolicy.skipAllBigrams(&pos)) {
- AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", mDictBufferSize, pos);
+ AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos);
mIsCorrupted = true;
ASSERT(false);
- *outUnigramProbability = NOT_A_PROBABILITY;
return 0;
}
}
@@ -267,18 +276,48 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
}
// This function gets the position of the terminal PtNode of the exact matching word in the
-// dictionary. If no match is found, it returns NOT_A_DICT_POS.
-int PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const {
+// dictionary. If no match is found, it returns NOT_A_WORD_ID.
+int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const {
DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader);
readingHelper.initWithPtNodeArrayPos(getRootPosition());
- const int ptNodePos =
- readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch);
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+ wordCodePoints.size(), forceLowerCaseSearch);
if (readingHelper.isError()) {
mIsCorrupted = true;
- AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+ AKLOGE("Dictionary reading error in getWordId().");
+ }
+ return getWordIdFromTerminalPtNodePos(ptNodePos);
+}
+
+const WordAttributes PatriciaTriePolicy::getWordAttributesInContext(
+ const WordIdArrayView prevWordIds, const int wordId,
+ MultiBigramMap *const multiBigramMap) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return WordAttributes();
+ }
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
+ const PtNodeParams ptNodeParams =
+ mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (multiBigramMap) {
+ const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */,
+ prevWordIds, wordId, ptNodeParams.getProbability());
+ return getWordAttributes(probability, ptNodeParams);
+ }
+ if (!prevWordIds.empty()) {
+ const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId);
+ if (bigramProbability != NOT_A_PROBABILITY) {
+ return getWordAttributes(bigramProbability, ptNodeParams);
+ }
}
- return ptNodePos;
+ return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY),
+ ptNodeParams);
+}
+
+const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const {
+ return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(),
+ ptNodeParams.isPossiblyOffensive());
}
int PatriciaTriePolicy::getProbability(const int unigramProbability,
@@ -297,21 +336,22 @@ int PatriciaTriePolicy::getProbability(const int unigramProbability,
}
}
-int PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos,
- const int ptNodePos) const {
- if (ptNodePos == NOT_A_DICT_POS) {
+int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+ const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
return NOT_A_PROBABILITY;
}
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
const PtNodeParams ptNodeParams =
mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) {
- // If this is not a word, or if it's a blacklisted entry, it should behave as
- // having no probability outside of the suggestion process (where it should be used
- // for shortcuts).
+ if (ptNodeParams.isNotAWord()) {
+ // If this is not a word, it should behave as having no probability outside of the
+ // suggestion process (where it should be used for shortcuts).
return NOT_A_PROBABILITY;
}
- if (prevWordsPtNodePos) {
- const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
+ if (!prevWordIds.empty()) {
+ const int bigramsPosition = getBigramsPositionOfPtNode(
+ getTerminalPtNodePosFromWordId(prevWordIds[0]));
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
while (bigramsIt.hasNext()) {
bigramsIt.next();
@@ -325,19 +365,26 @@ int PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodeP
return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
}
-void PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos,
+void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const {
- if (!prevWordsPtNodePos) {
+ if (prevWordIds.empty()) {
return;
}
- const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
+ const int bigramsPosition = getBigramsPositionOfPtNode(
+ getTerminalPtNodePosFromWordId(prevWordIds[0]));
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition);
while (bigramsIt.hasNext()) {
bigramsIt.next();
- listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos());
+ listener->onVisitEntry(bigramsIt.getProbability(),
+ getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()));
}
}
+BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const {
+ const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId));
+ return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos);
+}
+
int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
if (ptNodePos == NOT_A_DICT_POS) {
return NOT_A_DICT_POS;
@@ -362,35 +409,32 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod
int shortcutPos = NOT_A_DICT_POS;
int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS;
- PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(),
- &mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
- &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
+ const int *const codePointTable = mHeaderPolicy.getCodePointTable();
+ PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy,
+ &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount,
+ mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos,
+ &siblingPos);
// Skip PtNodes not starting with a Unicode code point; they represent non-word information.
if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) {
- childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability,
- PatriciaTrieReadingUtils::isTerminal(flags),
- PatriciaTrieReadingUtils::hasChildrenInFlags(flags),
- PatriciaTrieReadingUtils::isBlacklisted(flags)
- || PatriciaTrieReadingUtils::isNotAWord(flags),
- mergedNodeCodePointCount, mergedNodeCodePoints);
+ const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? ptNodePos : NOT_A_WORD_ID;
+ childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId,
+ CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount));
}
return siblingPos;
}
-const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoints,
- const int codePointCount) const {
- const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
- false /* forceLowerCaseSearch */);
- if (ptNodePos == NOT_A_DICT_POS) {
+const WordProperty PatriciaTriePolicy::getWordProperty(
+ const CodePointArrayView wordCodePoints) const {
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
AKLOGE("getWordProperty was called for invalid word.");
return WordProperty();
}
+ const int ptNodePos = getTerminalPtNodePosFromWordId(wordId);
const PtNodeParams ptNodeParams =
mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- std::vector<int> codePointVector(ptNodeParams.getCodePoints(),
- ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
// Fetch bigram information.
- std::vector<BigramProperty> bigrams;
+ std::vector<NgramProperty> ngrams;
const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
int bigramWord1CodePoints[MAX_WORD_LENGTH];
BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos);
@@ -401,13 +445,14 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) {
int word1Probability = NOT_A_PROBABILITY;
const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
- bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints,
- &word1Probability);
- const std::vector<int> word1(bigramWord1CodePoints,
- bigramWord1CodePoints + word1CodePointCount);
+ getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH,
+ bigramWord1CodePoints, &word1Probability);
const int probability = getProbability(word1Probability, bigramsIt.getProbability());
- bigrams.emplace_back(&word1, probability,
- NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */);
+ ngrams.emplace_back(
+ NgramContext(wordCodePoints.data(), wordCodePoints.size(),
+ ptNodeParams.representsBeginningOfSentence()),
+ CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(),
+ probability, HistoricalInfo());
}
}
// Fetch shortcut information.
@@ -415,25 +460,25 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin
int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
if (shortcutPos != NOT_A_DICT_POS) {
int shortcutTargetCodePoints[MAX_WORD_LENGTH];
- ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot, &shortcutPos);
+ ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos);
bool hasNext = true;
while (hasNext) {
const ShortcutListReadingUtils::ShortcutFlags shortcutFlags =
- ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, &shortcutPos);
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos);
hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags);
const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget(
- mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
- const std::vector<int> shortcutTarget(shortcutTargetCodePoints,
- shortcutTargetCodePoints + shortcutTargetLength);
+ mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos);
const int shortcutProbability =
ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags);
- shortcuts.emplace_back(&shortcutTarget, shortcutProbability);
+ shortcuts.emplace_back(
+ CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(),
+ shortcutProbability);
}
}
const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
- ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
- NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
- return WordProperty(&codePointVector, &unigramProperty, &bigrams);
+ ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(),
+ ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts));
+ return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
}
int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
@@ -455,9 +500,8 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC
return 0;
}
const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
- int unigramProbability = NOT_A_PROBABILITY;
- *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos,
- MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
+ *outCodePointCount = getCodePointsAndReturnCodePointCount(
+ getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints);
const int nextToken = token + 1;
if (nextToken >= terminalPtNodePositionsVectorSize) {
// All words have been iterated.
@@ -467,4 +511,16 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC
return nextToken;
}
+int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const {
+ return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos;
+}
+
+int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const {
+ return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId;
+}
+
+bool PatriciaTriePolicy::isValidPos(const int pos) const {
+ return pos >= 0 && pos < static_cast<int>(mBuffer.size());
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h
index 70351d147..8edfa7d10 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h
+++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h
@@ -21,35 +21,36 @@
#include <vector>
#include "defines.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/v2/bigram/bigram_list_policy.h"
+#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h"
+#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
+#include "dictionary/structure/v2/ver2_pt_node_array_reader.h"
+#include "dictionary/utils/format_utils.h"
+#include "dictionary/utils/mmapped_buffer.h"
#include "utils/byte_array_view.h"
+#include "utils/int_array_view.h"
namespace latinime {
class DicNode;
class DicNodeVector;
+// Word id = Position of a PtNode that represents the word.
+// Max supported n-gram is bigram.
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
public:
PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer)
: mMmappedBuffer(std::move(mmappedBuffer)),
mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(),
- FormatUtils::VERSION_2),
- mDictRoot(mMmappedBuffer->getReadOnlyByteArrayView().data()
- + mHeaderPolicy.getSize()),
- mDictBufferSize(mMmappedBuffer->getReadOnlyByteArrayView().size()
- - mHeaderPolicy.getSize()),
- mBigramListPolicy(mDictRoot, mDictBufferSize), mShortcutListPolicy(mDictRoot),
- mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy),
- mPtNodeArrayReader(mDictRoot, mDictBufferSize),
- mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}
+ FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())),
+ mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())),
+ mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer),
+ mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy,
+ mHeaderPolicy.getCodePointTable()),
+ mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(),
+ mIsCorrupted(false) {}
AK_FORCE_INLINE int getRootPosition() const {
return 0;
@@ -58,57 +59,62 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
void createAndGetAllChildDicNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const;
- int getCodePointsAndProbabilityAndReturnCodePointCount(
- const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints,
- int *const outUnigramProbability) const;
+ int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const;
+
+ int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
- int getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const;
+ const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const;
int getProbability(const int unigramProbability, const int bigramProbability) const;
- int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, const int ptNodePos) const;
+ int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
- void iterateNgramEntries(const int *const prevWordsPtNodePos,
+ void iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const;
- int getShortcutPositionOfPtNode(const int ptNodePos) const;
+ BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
return &mHeaderPolicy;
}
- const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
- return &mShortcutListPolicy;
- }
-
- bool addUnigramEntry(const int *const word, const int length,
+ bool addUnigramEntry(const CodePointArrayView wordCodePoints,
const UnigramProperty *const unigramProperty) {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
return false;
}
- bool removeUnigramEntry(const int *const word, const int length) {
+ bool removeUnigramEntry(const CodePointArrayView wordCodePoints) {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
return false;
}
- bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty) {
+ bool addNgramEntry(const NgramProperty *const ngramProperty) {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
return false;
}
- bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
- const int length) {
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
return false;
}
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo) {
+ // This method should not be called for non-updatable dictionary.
+ AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+ "dictionary.");
+ return false;
+ }
+
bool flush(const char *const filePath) {
// This method should not be called for non-updatable dictionary.
AKLOGI("Warning: flush() is called for non-updatable dictionary.");
@@ -135,8 +141,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
}
}
- const WordProperty getWordProperty(const int *const codePoints,
- const int codePointCount) const;
+ const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
@@ -150,8 +155,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const MmappedBuffer::MmappedBufferPtr mMmappedBuffer;
const HeaderPolicy mHeaderPolicy;
- const uint8_t *const mDictRoot;
- const int mDictBufferSize;
+ const ReadOnlyByteArrayView mBuffer;
const BigramListPolicy mBigramListPolicy;
const ShortcutListPolicy mShortcutListPolicy;
const Ver2ParticiaTrieNodeReader mPtNodeReader;
@@ -159,9 +163,18 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
mutable bool mIsCorrupted;
+ int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints,
+ int *const outUnigramProbability) const;
+ int getShortcutPositionOfPtNode(const int ptNodePos) const;
int getBigramsPositionOfPtNode(const int ptNodePos) const;
int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos,
DicNodeVector *const childDicNodes) const;
+ int getWordIdFromTerminalPtNodePos(const int ptNodePos) const;
+ int getTerminalPtNodePosFromWordId(const int wordId) const;
+ const WordAttributes getWordAttributes(const int probability,
+ const PtNodeParams &ptNodeParams) const;
+ bool isValidPos(const int pos) const;
};
} // namespace latinime
#endif // LATINIME_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h
index 8e16ccc05..995b1ed01 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h
+++ b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h
@@ -20,15 +20,15 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "utils/byte_array_view.h"
namespace latinime {
class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
public:
- explicit ShortcutListPolicy(const uint8_t *const shortcutBuf)
- : mShortcutsBuf(shortcutBuf) {}
+ explicit ShortcutListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}
~ShortcutListPolicy() {}
@@ -37,7 +37,7 @@ class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
return NOT_A_DICT_POS;
}
int listPos = pos;
- ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mShortcutsBuf, &listPos);
+ ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &listPos);
return listPos;
}
@@ -45,7 +45,7 @@ class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext,
int *const pos) const {
const ShortcutListReadingUtils::ShortcutFlags flags =
- ShortcutListReadingUtils::getFlagsAndForwardPointer(mShortcutsBuf, pos);
+ ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, pos);
if (outHasNext) {
*outHasNext = ShortcutListReadingUtils::hasNext(flags);
}
@@ -54,20 +54,20 @@ class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy {
}
if (outCodePoint) {
*outCodePointCount = ShortcutListReadingUtils::readShortcutTarget(
- mShortcutsBuf, maxCodePointCount, outCodePoint, pos);
+ mBuffer, maxCodePointCount, outCodePoint, pos);
}
}
void skipAllShortcuts(int *const pos) const {
const int shortcutListSize = ShortcutListReadingUtils
- ::getShortcutListSizeAndForwardPointer(mShortcutsBuf, pos);
+ ::getShortcutListSizeAndForwardPointer(mBuffer, pos);
*pos += shortcutListSize;
}
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy);
- const uint8_t *const mShortcutsBuf;
+ const ReadOnlyByteArrayView mBuffer;
};
} // namespace latinime
#endif // LATINIME_SHORTCUT_LIST_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
index c1e938710..cbb8ead81 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
+++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp
@@ -14,18 +14,18 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
+#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
namespace latinime {
const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos(
const int ptNodePos) const {
- if (ptNodePos < 0 || ptNodePos >= mDictSize) {
+ if (ptNodePos < 0 || ptNodePos >= static_cast<int>(mBuffer.size())) {
// Reading invalid position because of bug or broken dictionary.
- AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d",
- ptNodePos, mDictSize);
+ AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %zd",
+ ptNodePos, mBuffer.size());
ASSERT(false);
return PtNodeParams();
}
@@ -37,9 +37,9 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo
int shortcutPos = NOT_A_DICT_POS;
int bigramPos = NOT_A_DICT_POS;
int siblingPos = NOT_A_DICT_POS;
- PatriciaTrieReadingUtils::readPtNodeInfo(mDictBuffer, ptNodePos, mShortuctPolicy,
- mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability,
- &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
+ PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortcutPolicy,
+ mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints,
+ &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos);
if (mergedNodeCodePointCount <= 0) {
AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount);
ASSERT(false);
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
index f0725b66d..dc87c7c68 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
+++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h
@@ -20,8 +20,9 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
+#include "utils/byte_array_view.h"
namespace latinime {
@@ -30,21 +31,22 @@ class DictionaryShortcutsStructurePolicy;
class Ver2ParticiaTrieNodeReader : public PtNodeReader {
public:
- Ver2ParticiaTrieNodeReader(const uint8_t *const dictBuffer, const int dictSize,
+ Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer,
const DictionaryBigramsStructurePolicy *const bigramPolicy,
- const DictionaryShortcutsStructurePolicy *const shortcutPolicy)
- : mDictBuffer(dictBuffer), mDictSize(dictSize), mBigramPolicy(bigramPolicy),
- mShortuctPolicy(shortcutPolicy) {}
+ const DictionaryShortcutsStructurePolicy *const shortcutPolicy,
+ const int *const codePointTable)
+ : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy),
+ mCodePointTable(codePointTable) {}
virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader);
- const uint8_t *const mDictBuffer;
- const int mDictSize;
+ const ReadOnlyByteArrayView mBuffer;
const DictionaryBigramsStructurePolicy *const mBigramPolicy;
- const DictionaryShortcutsStructurePolicy *const mShortuctPolicy;
+ const DictionaryShortcutsStructurePolicy *const mShortcutPolicy;
+ const int *const mCodePointTable;
};
} // namespace latinime
#endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp
index b46617d96..8b9b02df1 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp
+++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp
@@ -14,24 +14,24 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h"
+#include "dictionary/structure/v2/ver2_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
namespace latinime {
bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
int *const outPtNodeCount, int *const outFirstPtNodePos) const {
- if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mDictSize) {
+ if (ptNodeArrayPos < 0 || ptNodeArrayPos >= static_cast<int>(mBuffer.size())) {
// Reading invalid position because of a bug or a broken dictionary.
- AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d",
- ptNodeArrayPos, mDictSize);
+ AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %zd",
+ ptNodeArrayPos, mBuffer.size());
ASSERT(false);
return false;
}
int readingPos = ptNodeArrayPos;
const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(
- mDictBuffer, &readingPos);
+ mBuffer.data(), &readingPos);
*outPtNodeCount = ptNodeCountInArray;
*outFirstPtNodePos = readingPos;
return true;
@@ -39,10 +39,10 @@ bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNode
bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos,
int *const outNextPtNodeArrayPos) const {
- if (forwordLinkPos < 0 || forwordLinkPos >= mDictSize) {
+ if (forwordLinkPos < 0 || forwordLinkPos >= static_cast<int>(mBuffer.size())) {
// Reading invalid position because of bug or broken dictionary.
- AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d",
- forwordLinkPos, mDictSize);
+ AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %zd",
+ forwordLinkPos, mBuffer.size());
ASSERT(false);
return false;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h
index 548272148..32fa96d15 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h
+++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h
@@ -20,14 +20,14 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "utils/byte_array_view.h"
namespace latinime {
class Ver2PtNodeArrayReader : public PtNodeArrayReader {
public:
- Ver2PtNodeArrayReader(const uint8_t *const dictBuffer, const int dictSize)
- : mDictBuffer(dictBuffer), mDictSize(dictSize) {};
+ Ver2PtNodeArrayReader(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {};
virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos,
int *const outPtNodeCount, int *const outFirstPtNodePos) const;
@@ -37,8 +37,7 @@ class Ver2PtNodeArrayReader : public PtNodeArrayReader {
private:
DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader);
- const uint8_t *const mDictBuffer;
- const int mDictSize;
+ const ReadOnlyByteArrayView mBuffer;
};
} // namespace latinime
#endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
new file mode 100644
index 000000000..165947f87
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+
+namespace latinime {
+
+// Lower bounds applied to the context count in computeRawProbabilityFromCounts(), looked up by
+// static_cast<int>(NgramType). Used to provide stable probabilities even if the user's input
+// count is small.
+const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1};
+
+// Encoded backoff weights, looked up by static_cast<int>(NgramType) and added to the encoded
+// probability in backoff().
+// Note that we give positive values for trigrams and quadgrams, which means the weight is more
+// than 1.
+// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight.
+const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8};
+
+// Entries whose timestamp is older than this duration are reported as removable by
+// shouldRemoveEntryDuringGC().
+const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS =
+        300 * 24 * 60 * 60; // 300 days
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
new file mode 100644
index 000000000..71824c954
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H
+
+#include <algorithm>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "utils/ngram_utils.h"
+#include "utils/time_keeper.h"
+
+namespace latinime {
+
+/**
+ * Static helpers that turn the counts and timestamps stored in HistoricalInfo into
+ * probabilities and garbage collection decisions.
+ */
+class DynamicLanguageModelProbabilityUtils {
+ public:
+    // Returns count / max(contextCount, ASSUMED_MIN_COUNTS[ngramType]). The lower bound on the
+    // denominator keeps the result stable while the context has been seen only a few times.
+    static float computeRawProbabilityFromCounts(const int count, const int contextCount,
+            const NgramType ngramType) {
+        const int minCount = ASSUMED_MIN_COUNTS[static_cast<int>(ngramType)];
+        return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount));
+    }
+
+    // Adds the encoded backoff weight for ngramType to ngramProbability and clamps the sum to
+    // the range [NOT_A_PROBABILITY, MAX_PROBABILITY].
+    static float backoff(const int ngramProbability, const NgramType ngramType) {
+        const int probability =
+                ngramProbability + ENCODED_BACKOFF_WEIGHTS[static_cast<int>(ngramType)];
+        return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY);
+    }
+
+    // Returns probability unchanged unless the entry's timestamp lies in the future relative to
+    // TimeKeeper::peekCurrentTime(), in which case NOT_A_PROBABILITY is returned.
+    static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) {
+        const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+        if (elapsedTime < 0) {
+            AKLOGE("The elapsed time is a negative value. Timestamp overflow?");
+            return NOT_A_PROBABILITY;
+        }
+        // TODO: Improve this logic.
+        // We don't modify probability depending on the elapsed time.
+        return probability;
+    }
+
+    // Returns non-zero when the entry is older than DURATION_TO_DISCARD_ENTRY_IN_SECONDS.
+    // NOTE(review): this is a predicate but is declared to return int; the return type is kept
+    // as is so that existing callers are unaffected.
+    static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) {
+        // TODO: Improve this logic.
+        const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp();
+        return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+    }
+
+    // Returns the value used to rank entries when the dictionary has to evict some of them.
+    static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) {
+        // TODO: Improve this logic.
+        // More recently input entries get higher priority.
+        return historicalInfo.getTimestamp();
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils);
+
+    // The constant tables below are defined with four elements, one per supported n-gram type.
+    static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram.");
+
+    static const int ASSUMED_MIN_COUNTS[];
+    static const int ENCODED_BACKOFF_WEIGHTS[];
+    static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS;
+};
+
+} // namespace latinime
+#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp
new file mode 100644
index 000000000..c10e4906b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+
+#include <algorithm>
+#include <cstring>
+
+#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h"
+#include "dictionary/utils/probability_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Indices identifying the trie map buffer and the global counters buffer.
+// NOTE(review): presumably positions in the list of buffers this content is built from; the
+// consumer is not visible here, so confirm against the class declaration.
+const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0;
+const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1;
+
+// Writes the trie map and then the global counters to file. Returns false as soon as one of
+// the two writes fails.
+bool LanguageModelDictContent::save(FILE *const file) const {
+    if (!mTrieMap.save(file)) {
+        // The global counters are not written when the trie map could not be saved.
+        return false;
+    }
+    return mGlobalCounters.save(file);
+}
+
+// Copies the entries of originalContent into this content, starting from the root level of its
+// trie map. Word ids are remapped through terminalIdMap by runGCInner().
+bool LanguageModelDictContent::runGC(
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const LanguageModelDictContent *const originalContent) {
+    const TrieMap &sourceTrieMap = originalContent->mTrieMap;
+    return runGCInner(terminalIdMap, sourceTrieMap.getEntriesInRootLevel(),
+            0 /* nextLevelBitmapEntryIndex */);
+}
+
+// Looks up the attributes of wordId in the context of prevWordIds.
+//
+// The longest prefix of prevWordIds that exists in the trie map is resolved first, then the
+// word is searched from the longest matched context down to the unigram level and the first
+// valid entry wins. When mustMatchAllPrevWords is true, only an entry whose context covers all
+// of prevWordIds is accepted. Returns a default constructed WordAttributes when no entry
+// qualifies, or when the dictionary has historical info and the unigram count of the word is 0.
+// The flags of the result are always taken from the unigram entry. headerPolicy is not used by
+// this implementation.
+const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds,
+        const int wordId, const bool mustMatchAllPrevWords,
+        const HeaderPolicy *const headerPolicy) const {
+    int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
+    bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex();
+    int maxPrevWordCount = 0;
+    // bitmapEntryIndices has one slot per context length up to MAX_PREV_WORD_COUNT_FOR_N_GRAM,
+    // so the walk must stop there even if the caller passes a longer view; otherwise the write
+    // to bitmapEntryIndices[i + 1] would run past the end of the array.
+    for (size_t i = 0; i < prevWordIds.size()
+            && i < static_cast<size_t>(MAX_PREV_WORD_COUNT_FOR_N_GRAM); ++i) {
+        const int nextBitmapEntryIndex =
+                mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex;
+        if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) {
+            break;
+        }
+        maxPrevWordCount = i + 1;
+        bitmapEntryIndices[i + 1] = nextBitmapEntryIndex;
+    }
+
+    const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
+    if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) {
+        // The word should be treated as an invalid word.
+        return WordAttributes();
+    }
+    // Try the longest matched context first and fall back to shorter ones.
+    for (int i = maxPrevWordCount; i >= 0; --i) {
+        if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) {
+            // The remaining candidates use fewer previous words than requested.
+            break;
+        }
+        const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]);
+        if (!result.mIsValid) {
+            continue;
+        }
+        const ProbabilityEntry probabilityEntry =
+                ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
+        int probability = NOT_A_PROBABILITY;
+        if (mHasHistoricalInfo) {
+            const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+            int contextCount = 0;
+            if (i == 0) {
+                // unigram
+                contextCount = mGlobalCounters.getTotalCount();
+            } else {
+                // The context count is the count stored for the context itself, i.e. the entry
+                // of prevWordIds[0] looked up with the rest of the context as its context.
+                const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry(
+                        prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]);
+                if (!prevWordProbabilityEntry.isValid()) {
+                    continue;
+                }
+                if (prevWordProbabilityEntry.representsBeginningOfSentence()
+                        && historicalInfo->getCount() == 1) {
+                    // BoS ngram requires multiple contextCount.
+                    continue;
+                }
+                contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount();
+            }
+            const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1);
+            const float rawProbability =
+                    DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts(
+                            historicalInfo->getCount(), contextCount, ngramType);
+            const int encodedRawProbability =
+                    ProbabilityUtils::encodeRawProbability(rawProbability);
+            const int decayedProbability =
+                    DynamicLanguageModelProbabilityUtils::getDecayedProbability(
+                            encodedRawProbability, *historicalInfo);
+            probability = DynamicLanguageModelProbabilityUtils::backoff(
+                    decayedProbability, ngramType);
+        } else {
+            probability = probabilityEntry.getProbability();
+        }
+        // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
+        // probabilityEntry.
+        return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
+                unigramProbabilityEntry.isNotAWord(),
+                unigramProbabilityEntry.isPossiblyOffensive());
+    }
+    // Cannot find the word.
+    return WordAttributes();
+}
+
+// Returns the entry stored for wordId in the context of prevWordIds, or a default constructed
+// ProbabilityEntry when either the context or the word cannot be found.
+ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry(
+        const WordIdArrayView prevWordIds, const int wordId) const {
+    const int levelIndex = getBitmapEntryIndex(prevWordIds);
+    if (levelIndex != TrieMap::INVALID_INDEX) {
+        const TrieMap::Result lookup = mTrieMap.get(wordId, levelIndex);
+        if (lookup.mIsValid) {
+            return ProbabilityEntry::decode(lookup.mValue, mHasHistoricalInfo);
+        }
+    }
+    // Not found.
+    return ProbabilityEntry();
+}
+
+// Stores probabilityEntry for wordId in the context of prevWordIds, creating the levels for the
+// context when they do not exist yet. Returns false for NOT_A_TERMINAL_ID, when the context
+// levels cannot be created, or when the trie map rejects the update.
+bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds,
+        const int wordId, const ProbabilityEntry *const probabilityEntry) {
+    // Reject the invalid id before touching the trie map so that no level gets created for it.
+    const bool isTerminal = wordId != Ver4DictConstants::NOT_A_TERMINAL_ID;
+    if (!isTerminal) {
+        return false;
+    }
+    const int levelIndex = createAndGetBitmapEntryIndex(prevWordIds);
+    return levelIndex != TrieMap::INVALID_INDEX
+            && mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), levelIndex);
+}
+
+// Removes the entry of wordId stored in the context of prevWordIds. Returns false when the
+// context does not exist or when the trie map fails to remove the entry.
+bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds,
+        const int wordId) {
+    const int levelIndex = getBitmapEntryIndex(prevWordIds);
+    // Without a bitmap entry for the context there is no entry to remove.
+    const bool hasLevel = levelIndex != TrieMap::INVALID_INDEX;
+    return hasLevel && mTrieMap.remove(wordId, levelIndex);
+}
+
+// Returns a range over all entries stored in the level addressed by prevWordIds.
+LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries(
+        const WordIdArrayView prevWordIds) const {
+    return EntryRange(
+            mTrieMap.getEntriesInSpecifiedLevel(getBitmapEntryIndex(prevWordIds)),
+            mHasHistoricalInfo);
+}
+
+// Collects every n-gram entry that has wordId as its first context word. The result is empty
+// when the word is unknown or has no next level in the trie map.
+std::vector<LanguageModelDictContent::DumppedFullEntryInfo>
+        LanguageModelDictContent::exportAllNgramEntriesRelatedToWord(
+                const HeaderPolicy *const headerPolicy, const int wordId) const {
+    std::vector<DumppedFullEntryInfo> entries;
+    const TrieMap::Result rootResult = mTrieMap.getRoot(wordId);
+    const bool hasRelatedEntries = rootResult.mIsValid
+            && rootResult.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX;
+    if (hasRelatedEntries) {
+        // The context starts with the word itself and grows while descending.
+        std::vector<int> prevWordIds(1, wordId);
+        exportAllNgramEntriesRelatedToWordInner(headerPolicy,
+                rootResult.mNextLevelBitmapEntryIndex, &prevWordIds, &entries);
+    }
+    return entries;
+}
+
+// Appends all valid entries of the level at bitmapEntryIndex to outBummpedFullEntryInfo and
+// then descends into the next level of every entry that has one. prevWordIds holds the context
+// of the current level; it is extended for the recursion and restored afterwards.
+void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner(
+        const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex,
+        std::vector<int> *const prevWordIds,
+        std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const {
+    for (const auto &levelEntry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+        const int currentWordId = levelEntry.key();
+        const ProbabilityEntry decodedEntry =
+                ProbabilityEntry::decode(levelEntry.value(), mHasHistoricalInfo);
+        if (decodedEntry.isValid()) {
+            // Attributes are computed for exactly this context, without falling back to
+            // shorter ones.
+            const WordAttributes attributes = getWordAttributes(
+                    WordIdArrayView(*prevWordIds), currentWordId,
+                    true /* mustMatchAllPrevWords */, headerPolicy);
+            outBummpedFullEntryInfo->emplace_back(*prevWordIds, currentWordId,
+                    attributes, decodedEntry);
+        }
+        if (!levelEntry.hasNextLevelMap()) {
+            continue;
+        }
+        prevWordIds->push_back(currentWordId);
+        exportAllNgramEntriesRelatedToWordInner(headerPolicy,
+                levelEntry.getNextLevelBitmapEntryIndex(), prevWordIds,
+                outBummpedFullEntryInfo);
+        prevWordIds->pop_back();
+    }
+}
+
+// Makes sure that no n-gram type holds more entries than maxEntryCounts allows. Types that are
+// within their limit keep their current count; the others are truncated level by level. The
+// resulting counts are written to outEntryCounters. Returns false when a truncation fails.
+bool LanguageModelDictContent::truncateEntries(const EntryCounts &currentEntryCounts,
+        const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy,
+        MutableEntryCounters *const outEntryCounters) {
+    for (int totalWordCount = 1; totalWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1;
+            ++totalWordCount) {
+        const int prevWordCount = totalWordCount - 1;
+        const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount);
+        const auto currentCount = currentEntryCounts.getNgramCount(ngramType);
+        const auto allowedCount = maxEntryCounts.getNgramCount(ngramType);
+        if (currentCount > allowedCount) {
+            // Over the limit: evict entries in this level and record how many remain.
+            int remainingCount = 0;
+            if (!turncateEntriesInSpecifiedLevel(headerPolicy, allowedCount, prevWordCount,
+                    &remainingCount)) {
+                return false;
+            }
+            outEntryCounters->setNgramCount(ngramType, remainingCount);
+        } else {
+            outEntryCounters->setNgramCount(ngramType, currentCount);
+        }
+    }
+    return true;
+}
+
+// Updates the unigram entry of wordId and the n-gram entries of wordId for every prefix of
+// prevWordIds after the word has been input.
+//
+// Only supported for dictionaries with historical info; otherwise an error is logged and false
+// is returned. The total count of the global counters is incremented once per call, and the
+// max value of the counters is updated with the count of each entry that is written. For every
+// n-gram entry that did not exist before, the matching counter in entryCountersToUpdate is
+// incremented. Processing of the context stops at the first NOT_A_WORD_ID in prevWordIds.
+// Returns false as soon as one of the entries cannot be stored; entries written before that
+// point are not rolled back.
+bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds,
+        const int wordId, const bool isValid, const HistoricalInfo historicalInfo,
+        const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) {
+    if (!mHasHistoricalInfo) {
+        AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info.");
+        return false;
+    }
+    // Unigram entry first.
+    const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId);
+    const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom(
+            originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy);
+    if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) {
+        return false;
+    }
+    mGlobalCounters.incrementTotalCount();
+    mGlobalCounters.updateMaxValueOfCounters(
+            updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount());
+    // Then one n-gram entry per context length: the first word, the first two words, ...
+    for (size_t i = 0; i < prevWordIds.size(); ++i) {
+        if (prevWordIds[i] == NOT_A_WORD_ID) {
+            break;
+        }
+        // TODO: Optimize this code.
+        const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1);
+        const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry(
+                limitedPrevWordIds, wordId);
+        const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom(
+                originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy);
+        if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) {
+            return false;
+        }
+        mGlobalCounters.updateMaxValueOfCounters(
+                updatedNgramProbabilityEntry.getHistoricalInfo()->getCount());
+        if (!originalNgramProbabilityEntry.isValid()) {
+            // (i + 2) words are used in total because the prevWords consists of (i + 1) words when
+            // looking at its i-th element.
+            entryCountersToUpdate->incrementNgramCount(
+                    NgramUtils::getNgramTypeFromWordCount(i + 2));
+        }
+    }
+    return true;
+}
+
+// Builds the entry that replaces originalProbabilityEntry after an input: the timestamp comes
+// from historicalInfo, the level is reset to 0 and the count is the sum of both counts. The
+// flags of a valid original entry are kept; otherwise the new entry has no flags. isValid and
+// headerPolicy are not used by this implementation.
+const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom(
+        const ProbabilityEntry &originalProbabilityEntry, const bool isValid,
+        const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const {
+    const int accumulatedCount = originalProbabilityEntry.getHistoricalInfo()->getCount()
+            + historicalInfo.getCount();
+    const HistoricalInfo updatedHistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
+            accumulatedCount);
+    if (!originalProbabilityEntry.isValid()) {
+        return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo);
+    }
+    return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo);
+}
+
+// Copies the entries of trieMapRange into the level at nextLevelBitmapEntryIndex of this
+// content's trie map, translating each key through terminalIdMap, and recurses into the next
+// level of every copied entry. Entries whose key is missing from the map or mapped to
+// NOT_A_TERMINAL_ID are skipped together with everything below them.
+bool LanguageModelDictContent::runGCInner(
+        const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+        const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) {
+    for (auto &sourceEntry : trieMapRange) {
+        const auto mapping = terminalIdMap->find(sourceEntry.key());
+        const bool isRemoved = mapping == terminalIdMap->end()
+                || mapping->second == Ver4DictConstants::NOT_A_TERMINAL_ID;
+        if (isRemoved) {
+            // The word has been removed.
+            continue;
+        }
+        if (!mTrieMap.put(mapping->second, sourceEntry.value(), nextLevelBitmapEntryIndex)) {
+            return false;
+        }
+        if (sourceEntry.hasNextLevelMap()
+                && !runGCInner(terminalIdMap, sourceEntry.getEntriesInNextLevel(),
+                        mTrieMap.getNextLevelBitmapEntryIndex(mapping->second,
+                                nextLevelBitmapEntryIndex))) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Walks down the trie map along prevWordIds and returns the bitmap entry index of the level
+// reached, creating missing entries and levels on the way. Returns TrieMap::INVALID_INDEX when
+// a missing entry cannot be inserted.
+int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) {
+    int currentLevelIndex = mTrieMap.getRootBitmapEntryIndex();
+    for (const int wordId : prevWordIds) {
+        const TrieMap::Result lookup = mTrieMap.get(wordId, currentLevelIndex);
+        const bool hasNextLevel = lookup.mIsValid
+                && lookup.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX;
+        if (hasNextLevel) {
+            // The level already exists; just step into it.
+            currentLevelIndex = lookup.mNextLevelBitmapEntryIndex;
+            continue;
+        }
+        // A missing word gets a default entry before its next level is requested.
+        if (!lookup.mIsValid && !mTrieMap.put(wordId,
+                ProbabilityEntry().encode(mHasHistoricalInfo), currentLevelIndex)) {
+            AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId,
+                    currentLevelIndex);
+            return TrieMap::INVALID_INDEX;
+        }
+        currentLevelIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId, currentLevelIndex);
+    }
+    return currentLevelIndex;
+}
+
+// Walks down the trie map along prevWordIds without modifying it. Returns the next level index
+// stored for the last word, the root index for an empty context, or TrieMap::INVALID_INDEX as
+// soon as one of the words has no valid entry.
+int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const {
+    int levelIndex = mTrieMap.getRootBitmapEntryIndex();
+    for (const int wordId : prevWordIds) {
+        const TrieMap::Result lookup = mTrieMap.get(wordId, levelIndex);
+        if (lookup.mIsValid) {
+            levelIndex = lookup.mNextLevelBitmapEntryIndex;
+            continue;
+        }
+        return TrieMap::INVALID_INDEX;
+    }
+    return levelIndex;
+}
+
+// Visits every entry of the level at bitmapEntryIndex and, recursively, of the levels below it,
+// and prepares the entries for garbage collection.
+//
+// An entry is removed when
+//  - it is a valid n-gram entry (prevWordCount > 0) whose key has no valid entry at the root
+//    level any more,
+//  - the dictionary has historical info and shouldRemoveEntryDuringGC() reports it as removable,
+//  - needsToHalveCounters is set and halving its count yields 0.
+// Removed entries are neither counted nor descended into. Every entry that is kept increments
+// the counter of its n-gram type in outEntryCounters. Returns false when the trie map rejects
+// an update or when an entry is found deeper than MAX_PREV_WORD_COUNT_FOR_N_GRAM.
+// headerPolicy is only passed through to the recursive calls.
+bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex,
+        const int prevWordCount, const HeaderPolicy *const headerPolicy,
+        const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) {
+    for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+        // Checked per entry, so an empty level that is too deep is not reported as an error.
+        if (prevWordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
+            AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.",
+                    prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM);
+            return false;
+        }
+        const ProbabilityEntry probabilityEntry =
+                ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo);
+        if (prevWordCount > 0 && probabilityEntry.isValid()
+                && !mTrieMap.getRoot(entry.key()).mIsValid) {
+            // The entry is related to a word that has been removed. Remove the entry.
+            if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+                return false;
+            }
+            continue;
+        }
+        if (mHasHistoricalInfo && probabilityEntry.isValid()) {
+            const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo();
+            if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC(
+                    *originalHistoricalInfo)) {
+                // Remove the entry.
+                if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+                    return false;
+                }
+                continue;
+            }
+            if (needsToHalveCounters) {
+                // Integer division: a count of 1 becomes 0 and the entry is dropped.
+                const int updatedCount = originalHistoricalInfo->getCount() / 2;
+                if (updatedCount == 0) {
+                    // Remove the entry.
+                    if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) {
+                        return false;
+                    }
+                    continue;
+                }
+                // Keep timestamp, level and flags; only the count changes.
+                const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(),
+                        originalHistoricalInfo->getLevel(), updatedCount);
+                const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(),
+                        &historicalInfoToSave);
+                if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo),
+                        bitmapEntryIndex)) {
+                    return false;
+                }
+            }
+        }
+        // The entry survived; count it for its n-gram type.
+        outEntryCounters->incrementNgramCount(
+                NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1));
+        if (!entry.hasNextLevelMap()) {
+            continue;
+        }
+        if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(),
+                prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Reduces the number of entries stored at depth targetLevel to at most maxEntryCount by
+// removing the entries that sort first according to EntryInfoToTurncate::Comparator. The number
+// of entries that the level is expected to hold afterwards is written to outEntryCount.
+// Returns false when the entries cannot be collected or a removal fails.
+bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel(
+        const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel,
+        int *const outEntryCount) {
+    std::vector<int> contextWordIds;
+    std::vector<EntryInfoToTurncate> candidates;
+    const bool collected = getEntryInfo(headerPolicy, targetLevel,
+            mTrieMap.getRootBitmapEntryIndex(), &contextWordIds, &candidates);
+    if (!collected) {
+        return false;
+    }
+    const int candidateCount = static_cast<int>(candidates.size());
+    if (candidateCount <= maxEntryCount) {
+        // Nothing to evict.
+        *outEntryCount = candidateCount;
+        return true;
+    }
+    *outEntryCount = maxEntryCount;
+    // Only the entries to be evicted need to be in order, so a partial sort is enough.
+    const int entryCountToRemove = candidateCount - maxEntryCount;
+    const auto evictionEnd = candidates.begin() + entryCountToRemove;
+    std::partial_sort(candidates.begin(), evictionEnd, candidates.end(),
+            EntryInfoToTurncate::Comparator());
+    for (auto victim = candidates.begin(); victim != evictionEnd; ++victim) {
+        const WordIdArrayView victimContext(victim->mPrevWordIds, victim->mPrevWordCount);
+        if (!removeNgramProbabilityEntry(victimContext, victim->mKey)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Collects one EntryInfoToTurncate per entry stored at depth targetLevel below the level at
+// bitmapEntryIndex. prevWordIds holds the context of the current level; its size tells how deep
+// the recursion already is. The priority of an entry comes from its historical info when the
+// dictionary has one and from its probability otherwise.
+bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const headerPolicy,
+        const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
+        std::vector<EntryInfoToTurncate> *const outEntryInfo) const {
+    const int currentLevel = prevWordIds->size();
+    if (currentLevel < targetLevel) {
+        // Not deep enough yet: descend through every entry that has a next level.
+        for (const auto &levelEntry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+            if (!levelEntry.hasNextLevelMap()) {
+                continue;
+            }
+            prevWordIds->push_back(levelEntry.key());
+            const bool succeeded = getEntryInfo(headerPolicy, targetLevel,
+                    levelEntry.getNextLevelBitmapEntryIndex(), prevWordIds, outEntryInfo);
+            if (!succeeded) {
+                return false;
+            }
+            prevWordIds->pop_back();
+        }
+        return true;
+    }
+    // Target depth reached: record every entry of this level.
+    for (const auto &levelEntry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) {
+        const ProbabilityEntry decodedEntry =
+                ProbabilityEntry::decode(levelEntry.value(), mHasHistoricalInfo);
+        int priority;
+        if (mHasHistoricalInfo) {
+            priority = DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction(
+                    *decodedEntry.getHistoricalInfo());
+        } else {
+            priority = decodedEntry.getProbability();
+        }
+        outEntryInfo->emplace_back(priority, decodedEntry.getHistoricalInfo()->getCount(),
+                levelEntry.key(), targetLevel, prevWordIds->data());
+    }
+    return true;
+}
+
+// Strict ordering of entries: lower priority first, then lower count, then smaller key, then
+// the larger number of previous words, and finally the previous word ids compared element by
+// element. Returns false when both arguments describe the same entry.
+bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()(
+        const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const {
+    if (left.mPriority != right.mPriority) return left.mPriority < right.mPriority;
+    if (left.mCount != right.mCount) return left.mCount < right.mCount;
+    if (left.mKey != right.mKey) return left.mKey < right.mKey;
+    if (left.mPrevWordCount != right.mPrevWordCount) {
+        // Note the reversed direction: the entry with more previous words sorts first.
+        return right.mPrevWordCount < left.mPrevWordCount;
+    }
+    // Skip the common prefix of the previous word ids.
+    const int prevWordCount = left.mPrevWordCount;
+    int index = 0;
+    while (index < prevWordCount && left.mPrevWordIds[index] == right.mPrevWordIds[index]) {
+        ++index;
+    }
+    // Either the first differing id decides, or left and right represent the same entry.
+    return index < prevWordCount && left.mPrevWordIds[index] < right.mPrevWordIds[index];
+}
+
+// Stores the ranking data of one entry and copies the first prevWordCount ids of prevWordIds
+// into the object, so the caller's array does not have to outlive it.
+// NOTE(review): the capacity of mPrevWordIds is declared in the header and is not checked
+// here; presumably prevWordCount never exceeds it - confirm against the class declaration.
+LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority,
+        const int count, const int key, const int prevWordCount, const int *const prevWordIds)
+        : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) {
+    const size_t byteCount = mPrevWordCount * sizeof(mPrevWordIds[0]);
+    memmove(mPrevWordIds, prevWordIds, byteCount);
+}
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h
new file mode 100644
index 000000000..db8c6e12b
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
+#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
+
+#include <cstdio>
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/word_attributes.h"
+#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/trie_map.h"
+#include "utils/byte_array_view.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class HeaderPolicy;
+
+/**
+ * Class representing language model.
+ *
+ * This class provides methods to get and store unigram/n-gram probability information and flags.
+ * Entries are kept in a TrieMap; global counters (total count and maximum counter value) are
+ * kept in a LanguageModelDictContentGlobalCounters instance.
+ */
+class LanguageModelDictContent {
+ public:
+    // Pair of word id and probability entry used for iteration.
+    class WordIdAndProbabilityEntry {
+     public:
+        WordIdAndProbabilityEntry(const int wordId, const ProbabilityEntry &probabilityEntry)
+                : mWordId(wordId), mProbabilityEntry(probabilityEntry) {}
+
+        int getWordId() const { return mWordId; }
+        const ProbabilityEntry getProbabilityEntry() const { return mProbabilityEntry; }
+
+     private:
+        DISALLOW_DEFAULT_CONSTRUCTOR(WordIdAndProbabilityEntry);
+        DISALLOW_ASSIGNMENT_OPERATOR(WordIdAndProbabilityEntry);
+
+        const int mWordId;
+        const ProbabilityEntry mProbabilityEntry;
+    };
+
+    // Iterator over entries. Wraps a TrieMap iterator and decodes each raw value into a
+    // ProbabilityEntry when dereferenced.
+    class EntryIterator {
+     public:
+        EntryIterator(const TrieMap::TrieMapIterator &trieMapIterator,
+                const bool hasHistoricalInfo)
+                : mTrieMapIterator(trieMapIterator), mHasHistoricalInfo(hasHistoricalInfo) {}
+
+        // Returns the current key as the word id together with the decoded probability entry.
+        const WordIdAndProbabilityEntry operator*() const {
+            const TrieMap::TrieMapIterator::IterationResult &result = *mTrieMapIterator;
+            return WordIdAndProbabilityEntry(
+                    result.key(), ProbabilityEntry::decode(result.value(), mHasHistoricalInfo));
+        }
+
+        bool operator!=(const EntryIterator &other) const {
+            return mTrieMapIterator != other.mTrieMapIterator;
+        }
+
+        const EntryIterator &operator++() {
+            ++mTrieMapIterator;
+            return *this;
+        }
+
+     private:
+        DISALLOW_DEFAULT_CONSTRUCTOR(EntryIterator);
+        DISALLOW_ASSIGNMENT_OPERATOR(EntryIterator);
+
+        TrieMap::TrieMapIterator mTrieMapIterator;
+        const bool mHasHistoricalInfo;
+    };
+
+    // Range wrapper that allows entries to be visited with a range-based for loop.
+    class EntryRange {
+     public:
+        EntryRange(const TrieMap::TrieMapRange trieMapRange, const bool hasHistoricalInfo)
+                : mTrieMapRange(trieMapRange), mHasHistoricalInfo(hasHistoricalInfo) {}
+
+        EntryIterator begin() const {
+            return EntryIterator(mTrieMapRange.begin(), mHasHistoricalInfo);
+        }
+
+        EntryIterator end() const {
+            return EntryIterator(mTrieMapRange.end(), mHasHistoricalInfo);
+        }
+
+     private:
+        DISALLOW_DEFAULT_CONSTRUCTOR(EntryRange);
+        DISALLOW_ASSIGNMENT_OPERATOR(EntryRange);
+
+        const TrieMap::TrieMapRange mTrieMapRange;
+        const bool mHasHistoricalInfo;
+    };
+
+    // Snapshot of one exported entry: the previous word ids (copied), the target word id, its
+    // word attributes and its probability entry.
+    class DumppedFullEntryInfo {
+     public:
+        // prevWordIds is only copied from, so it is taken by const reference. This also lets
+        // callers pass const vectors and temporaries.
+        DumppedFullEntryInfo(const std::vector<int> &prevWordIds, const int targetWordId,
+                const WordAttributes &wordAttributes, const ProbabilityEntry &probabilityEntry)
+                : mPrevWordIds(prevWordIds), mTargetWordId(targetWordId),
+                  mWordAttributes(wordAttributes), mProbabilityEntry(probabilityEntry) {}
+
+        const WordIdArrayView getPrevWordIds() const { return WordIdArrayView(mPrevWordIds); }
+        int getTargetWordId() const { return mTargetWordId; }
+        const WordAttributes &getWordAttributes() const { return mWordAttributes; }
+        const ProbabilityEntry &getProbabilityEntry() const { return mProbabilityEntry; }
+
+     private:
+        DISALLOW_ASSIGNMENT_OPERATOR(DumppedFullEntryInfo);
+
+        const std::vector<int> mPrevWordIds;
+        const int mTargetWordId;
+        const WordAttributes mWordAttributes;
+        const ProbabilityEntry mProbabilityEntry;
+    };
+
+    // Creates content backed by existing buffers. buffers is indexed with
+    // TRIE_MAP_BUFFER_INDEX and GLOBAL_COUNTERS_BUFFER_INDEX.
+    LanguageModelDictContent(const ReadWriteByteArrayView *const buffers,
+            const bool hasHistoricalInfo)
+            : mTrieMap(buffers[TRIE_MAP_BUFFER_INDEX]),
+              mGlobalCounters(buffers[GLOBAL_COUNTERS_BUFFER_INDEX]),
+              mHasHistoricalInfo(hasHistoricalInfo) {}
+
+    // Creates empty content with a default-constructed trie map and global counters.
+    explicit LanguageModelDictContent(const bool hasHistoricalInfo)
+            : mTrieMap(), mGlobalCounters(), mHasHistoricalInfo(hasHistoricalInfo) {}
+
+    // True when either the trie map is near its size limit or the global counters need halving.
+    bool isNearSizeLimit() const {
+        return mTrieMap.isNearSizeLimit() || mGlobalCounters.needsToHalveCounters();
+    }
+
+    bool save(FILE *const file) const;
+
+    bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const LanguageModelDictContent *const originalContent);
+
+    const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId,
+            const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const;
+
+    // Unigram lookup: n-gram lookup with an empty previous-word context.
+    ProbabilityEntry getProbabilityEntry(const int wordId) const {
+        return getNgramProbabilityEntry(WordIdArrayView(), wordId);
+    }
+
+    // Unigram store. The entry's historical count is added to the global total count before the
+    // entry is written with an empty previous-word context.
+    bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) {
+        mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount());
+        return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry);
+    }
+
+    // Unigram removal: n-gram removal with an empty previous-word context.
+    bool removeProbabilityEntry(const int wordId) {
+        return removeNgramProbabilityEntry(WordIdArrayView(), wordId);
+    }
+
+    ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds,
+            const int wordId) const;
+
+    bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId,
+            const ProbabilityEntry *const probabilityEntry);
+
+    bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId);
+
+    EntryRange getProbabilityEntries(const WordIdArrayView prevWordIds) const;
+
+    std::vector<DumppedFullEntryInfo> exportAllNgramEntriesRelatedToWord(
+            const HeaderPolicy *const headerPolicy, const int wordId) const;
+
+    // Runs updateAllProbabilityEntriesForGCInner from the trie map root with a previous-word
+    // count of 0, telling it whether counters currently need halving. On success the global
+    // counters themselves are halved if needed. Returns false if the inner update fails.
+    bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy,
+            MutableEntryCounters *const outEntryCounters) {
+        if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(),
+                0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(),
+                outEntryCounters)) {
+            return false;
+        }
+        if (mGlobalCounters.needsToHalveCounters()) {
+            mGlobalCounters.halveCounters();
+        }
+        return true;
+    }
+
+    // currentEntryCounts should be the counts produced by a preceding update pass
+    // (presumably updateAllProbabilityEntriesForGC; this class declares no method named
+    // updateAllProbabilityEntries).
+    bool truncateEntries(const EntryCounts &currentEntryCounts, const EntryCounts &maxEntryCounts,
+            const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters);
+
+    bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId,
+            const bool isValid, const HistoricalInfo historicalInfo,
+            const HeaderPolicy *const headerPolicy,
+            MutableEntryCounters *const entryCountersToUpdate);
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent);
+
+    // Record describing one entry considered during truncation. Comparator defines the
+    // ordering between two such records.
+    class EntryInfoToTurncate {
+     public:
+        class Comparator {
+         public:
+            bool operator()(const EntryInfoToTurncate &left,
+                    const EntryInfoToTurncate &right) const;
+         private:
+            DISALLOW_ASSIGNMENT_OPERATOR(Comparator);
+        };
+
+        EntryInfoToTurncate(const int priority, const int count, const int key,
+                const int prevWordCount, const int *const prevWordIds);
+
+        int mPriority;
+        // TODO: Remove.
+        int mCount;
+        int mKey;
+        int mPrevWordCount;
+        int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1];
+
+     private:
+        DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate);
+    };
+
+    static const int TRIE_MAP_BUFFER_INDEX;
+    static const int GLOBAL_COUNTERS_BUFFER_INDEX;
+
+    TrieMap mTrieMap;
+    LanguageModelDictContentGlobalCounters mGlobalCounters;
+    const bool mHasHistoricalInfo;
+
+    bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
+            const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex);
+    int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds);
+    int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
+    bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount,
+            const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters,
+            MutableEntryCounters *const outEntryCounters);
+    bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy,
+            const int maxEntryCount, const int targetLevel, int *const outEntryCount);
+    bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel,
+            const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
+            std::vector<EntryInfoToTurncate> *const outEntryInfo) const;
+    const ProbabilityEntry createUpdatedEntryFrom(const ProbabilityEntry &originalProbabilityEntry,
+            const bool isValid, const HistoricalInfo historicalInfo,
+            const HeaderPolicy *const headerPolicy) const;
+    void exportAllNgramEntriesRelatedToWordInner(const HeaderPolicy *const headerPolicy,
+            const int bitmapEntryIndex, std::vector<int> *const prevWordIds,
+            std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const;
+};
+} // namespace latinime
+#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp
new file mode 100644
index 000000000..89cf0e306
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h"
+
+#include <climits>
+
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+
+namespace latinime {
+
+// Threshold for the maximum counter value: 64 below 2^(WORD_COUNT_FIELD_SIZE * CHAR_BIT).
+// NOTE(review): assumes WORD_COUNT_FIELD_SIZE is a byte count small enough that the shift fits
+// in an int - confirm against Ver4DictConstants.
+const int LanguageModelDictContentGlobalCounters::COUNTER_VALUE_NEAR_LIMIT_THRESHOLD =
+ (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 64;
+// Threshold for the total count (2^30).
+const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD = 1 << 30;
+// Width in bytes of each counter as stored in the buffer.
+const int LanguageModelDictContentGlobalCounters::COUNTER_SIZE_IN_BYTES = 4;
+// Slot indices of the counters; a slot's byte position is index * COUNTER_SIZE_IN_BYTES.
+const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_INDEX = 0;
+const int LanguageModelDictContentGlobalCounters::MAX_VALUE_OF_COUNTERS_INDEX = 1;
+
+} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
new file mode 100644
index 000000000..3f87c0ea0
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H
+#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H
+
+#include <cstdio>
+
+#include "defines.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "utils/byte_array_view.h"
+
+namespace latinime {
+
+/**
+ * Global counters stored with the language model content.
+ *
+ * Two integer counters are tracked: a total count and the maximum counter value reported via
+ * updateMaxValueOfCounters(). They are read from a buffer at construction (or start at zero) and
+ * written out by save().
+ */
+class LanguageModelDictContentGlobalCounters {
+ public:
+    // Reads both counters from buffer. A counter whose slot lies beyond the buffer tail is
+    // initialized to 0 (see readValue()).
+    explicit LanguageModelDictContentGlobalCounters(const ReadWriteByteArrayView buffer)
+            : mBuffer(buffer, 0 /* maxAdditionalBufferSize */),
+              mTotalCount(readValue(mBuffer, TOTAL_COUNT_INDEX)),
+              mMaxValueOfCounters(readValue(mBuffer, MAX_VALUE_OF_COUNTERS_INDEX)) {}
+
+    // Creates counters that both start at zero.
+    LanguageModelDictContentGlobalCounters()
+            : mBuffer(0 /* maxAdditionalBufferSize */), mTotalCount(0), mMaxValueOfCounters(0) {}
+
+    // True once either counter has reached its near-limit threshold.
+    bool needsToHalveCounters() const {
+        return mMaxValueOfCounters >= COUNTER_VALUE_NEAR_LIMIT_THRESHOLD
+                || mTotalCount >= TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD;
+    }
+
+    int getTotalCount() const {
+        return mTotalCount;
+    }
+
+    // Serializes the current in-memory counter values into a fresh buffer and appends that
+    // buffer to file. Returns false if any write fails.
+    bool save(FILE *const file) const {
+        BufferWithExtendableBuffer bufferToWrite(
+                BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+        if (!bufferToWrite.writeUint(mTotalCount, COUNTER_SIZE_IN_BYTES,
+                TOTAL_COUNT_INDEX * COUNTER_SIZE_IN_BYTES)) {
+            return false;
+        }
+        if (!bufferToWrite.writeUint(mMaxValueOfCounters, COUNTER_SIZE_IN_BYTES,
+                MAX_VALUE_OF_COUNTERS_INDEX * COUNTER_SIZE_IN_BYTES)) {
+            return false;
+        }
+        return DictFileWritingUtils::writeBufferToFileTail(file, &bufferToWrite);
+    }
+
+    void incrementTotalCount() {
+        mTotalCount += 1;
+    }
+
+    void addToTotalCount(const int count) {
+        mTotalCount += count;
+    }
+
+    // Raises the recorded maximum to count when count is larger. Written as a plain comparison
+    // rather than std::max because this header does not include <algorithm>.
+    void updateMaxValueOfCounters(const int count) {
+        if (count > mMaxValueOfCounters) {
+            mMaxValueOfCounters = count;
+        }
+    }
+
+    // Halves both counters using integer division.
+    void halveCounters() {
+        mMaxValueOfCounters /= 2;
+        mTotalCount /= 2;
+    }
+
+ private:
+    DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContentGlobalCounters);
+
+    static const int COUNTER_VALUE_NEAR_LIMIT_THRESHOLD;
+    static const int TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD;
+    static const int COUNTER_SIZE_IN_BYTES;
+    static const int TOTAL_COUNT_INDEX;
+    static const int MAX_VALUE_OF_COUNTERS_INDEX;
+
+    BufferWithExtendableBuffer mBuffer;
+    int mTotalCount;
+    int mMaxValueOfCounters;
+
+    // Reads the counter stored in slot index of buffer, or returns 0 when that slot would
+    // extend past the buffer's tail position.
+    static int readValue(const BufferWithExtendableBuffer &buffer, const int index) {
+        const int pos = COUNTER_SIZE_IN_BYTES * index;
+        if (pos + COUNTER_SIZE_IN_BYTES > buffer.getTailPosition()) {
+            return 0;
+        }
+        return buffer.readUint(COUNTER_SIZE_IN_BYTES, pos);
+    }
+};
+} // namespace latinime
+#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/dictionary/structure/v4/content/probability_entry.h
index feff6b57f..473354b90 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h
+++ b/native/jni/src/dictionary/structure/v4/content/probability_entry.h
@@ -21,8 +21,10 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/historical_info.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
namespace latinime {
@@ -34,31 +36,40 @@ class ProbabilityEntry {
// Dummy entry
ProbabilityEntry()
- : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {}
+ : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY),
+ mHistoricalInfo() {}
// Entry without historical information
ProbabilityEntry(const int flags, const int probability)
: mFlags(flags), mProbability(probability), mHistoricalInfo() {}
// Entry with historical information.
- ProbabilityEntry(const int flags, const int probability,
- const HistoricalInfo *const historicalInfo)
- : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {}
-
- const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const {
- return ProbabilityEntry(mFlags, probability, &mHistoricalInfo);
- }
-
- const ProbabilityEntry createEntryWithUpdatedHistoricalInfo(
- const HistoricalInfo *const historicalInfo) const {
- return ProbabilityEntry(mFlags, mProbability, historicalInfo);
+ ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo)
+ : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {}
+
+ // Create from unigram property.
+ // Flags are built by createFlags() from the property's beginning-of-sentence, not-a-word,
+ // blacklisted and possibly-offensive attributes; probability and historical info are taken
+ // from the property as is.
+ ProbabilityEntry(const UnigramProperty *const unigramProperty)
+ : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(),
+ unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(),
+ unigramProperty->isPossiblyOffensive())),
+ mProbability(unigramProperty->getProbability()),
+ mHistoricalInfo(unigramProperty->getHistoricalInfo()) {}
+
+ // Create from ngram property.
+ // Probability and historical info are taken from the property; flags are always 0 for now.
+ // TODO: Set flags.
+ ProbabilityEntry(const NgramProperty *const ngramProperty)
+ : mFlags(0), mProbability(ngramProperty->getProbability()),
+ mHistoricalInfo(ngramProperty->getHistoricalInfo()) {}
+
+ bool isValid() const {
+ return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0;
}
bool hasHistoricalInfo() const {
return mHistoricalInfo.isValid();
}
- int getFlags() const {
+ uint8_t getFlags() const {
return mFlags;
}
@@ -70,18 +81,34 @@ class ProbabilityEntry {
return &mHistoricalInfo;
}
+ // Flag accessors: each returns true when the corresponding Ver4DictConstants flag bit is set
+ // in mFlags.
+ bool representsBeginningOfSentence() const {
+ return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0;
+ }
+
+ // True when the FLAG_NOT_A_WORD bit is set.
+ bool isNotAWord() const {
+ return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0;
+ }
+
+ // True when the FLAG_BLACKLISTED bit is set.
+ bool isBlacklisted() const {
+ return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0;
+ }
+
+ // True when the FLAG_POSSIBLY_OFFENSIVE bit is set.
+ bool isPossiblyOffensive() const {
+ return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0;
+ }
+
uint64_t encode(const bool hasHistoricalInfo) const {
- uint64_t encodedEntry = static_cast<uint64_t>(mFlags);
+ uint64_t encodedEntry = static_cast<uint8_t>(mFlags);
if (hasHistoricalInfo) {
encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT))
- ^ static_cast<uint64_t>(mHistoricalInfo.getTimeStamp());
+ | static_cast<uint32_t>(mHistoricalInfo.getTimestamp());
encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT))
- ^ static_cast<uint64_t>(mHistoricalInfo.getLevel());
+ | static_cast<uint8_t>(mHistoricalInfo.getLevel());
encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT))
- ^ static_cast<uint64_t>(mHistoricalInfo.getCount());
+ | static_cast<uint16_t>(mHistoricalInfo.getCount());
} else {
encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT))
- ^ static_cast<uint64_t>(mProbability);
+ | static_cast<uint8_t>(mProbability);
}
return encodedEntry;
}
@@ -89,7 +116,7 @@ class ProbabilityEntry {
static ProbabilityEntry decode(const uint64_t encodedEntry, const bool hasHistoricalInfo) {
if (hasHistoricalInfo) {
const int flags = readFromEncodedEntry(encodedEntry,
- Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE,
+ Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE,
Ver4DictConstants::TIME_STAMP_FIELD_SIZE
+ Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
+ Ver4DictConstants::WORD_COUNT_FIELD_SIZE);
@@ -103,10 +130,10 @@ class ProbabilityEntry {
const int count = readFromEncodedEntry(encodedEntry,
Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */);
const HistoricalInfo historicalInfo(timestamp, level, count);
- return ProbabilityEntry(flags, NOT_A_PROBABILITY, &historicalInfo);
+ return ProbabilityEntry(flags, &historicalInfo);
} else {
const int flags = readFromEncodedEntry(encodedEntry,
- Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE,
+ Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE,
Ver4DictConstants::PROBABILITY_SIZE);
const int probability = readFromEncodedEntry(encodedEntry,
Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */);
@@ -118,7 +145,7 @@ class ProbabilityEntry {
// Copy constructor is public to use this class as a type of return value.
DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry);
- const int mFlags;
+ const uint8_t mFlags;
const int mProbability;
const HistoricalInfo mHistoricalInfo;
@@ -126,6 +153,24 @@ class ProbabilityEntry {
return static_cast<int>(
(encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1));
}
+
+ // Packs the four boolean attributes into a flags byte: each attribute that is true contributes
+ // its Ver4DictConstants flag bit, and attributes that are false contribute nothing.
+ static uint8_t createFlags(const bool representsBeginningOfSentence,
+         const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) {
+     const int beginningOfSentenceBits = representsBeginningOfSentence
+             ? Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE : 0;
+     const int notAWordBits = isNotAWord ? Ver4DictConstants::FLAG_NOT_A_WORD : 0;
+     const int blacklistedBits = isBlacklisted ? Ver4DictConstants::FLAG_BLACKLISTED : 0;
+     const int possiblyOffensiveBits = isPossiblyOffensive
+             ? Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE : 0;
+     return static_cast<uint8_t>(
+             beginningOfSentenceBits | notAWordBits | blacklistedBits | possiblyOffensiveBits);
+ }
};
} // namespace latinime
#endif /* LATINIME_PROBABILITY_ENTRY_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp
index 41d9c544c..e3b419449 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp
+++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h
index 7b12aff16..27de4e79e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h
+++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h
@@ -17,21 +17,21 @@
#ifndef LATINIME_SHORTCUT_DICT_CONTENT_H
#define LATINIME_SHORTCUT_DICT_CONTENT_H
-#include <cstdint>
#include <cstdio>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/content/sparse_table_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
namespace latinime {
+class ReadWriteByteArrayView;
+
class ShortcutDictContent : public SparseTableDictContent {
public:
- ShortcutDictContent(uint8_t *const *buffers, const int *bufferSizes)
- : SparseTableDictContent(buffers, bufferSizes,
- Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
+ ShortcutDictContent(const ReadWriteByteArrayView *const buffers)
+ : SparseTableDictContent(buffers, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE,
Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {}
ShortcutDictContent()
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h
index 921774181..6faa9a28b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h
+++ b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h
@@ -17,22 +17,21 @@
#ifndef LATINIME_SINGLE_DICT_CONTENT_H
#define LATINIME_SINGLE_DICT_CONTENT_H
-#include <cstdint>
#include <cstdio>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
#include "utils/byte_array_view.h"
namespace latinime {
class SingleDictContent {
public:
- SingleDictContent(uint8_t *const buffer, const int bufferSize)
- : mExpandableContentBuffer(ReadWriteByteArrayView(buffer, bufferSize),
- BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {}
+ SingleDictContent(const ReadWriteByteArrayView buffer)
+ : mExpandableContentBuffer(buffer,
+ BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {}
SingleDictContent()
: mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE) {}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp
index 896ce6bd2..685365f36 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp
+++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h"
+#include "dictionary/structure/v4/content/sparse_table_dict_content.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h
index c98dd11fd..6245abc8e 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h
+++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h
@@ -17,13 +17,12 @@
#ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H
#define LATINIME_SPARSE_TABLE_DICT_CONTENT_H
-#include <cstdint>
#include <cstdio>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/sparse_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/sparse_table.h"
#include "utils/byte_array_view.h"
namespace latinime {
@@ -31,19 +30,13 @@ namespace latinime {
// TODO: Support multiple contents.
class SparseTableDictContent {
public:
- AK_FORCE_INLINE SparseTableDictContent(uint8_t *const *buffers, const int *bufferSizes,
+ AK_FORCE_INLINE SparseTableDictContent(const ReadWriteByteArrayView *const buffers,
const int sparseTableBlockSize, const int sparseTableDataSize)
- : mExpandableLookupTableBuffer(
- ReadWriteByteArrayView(buffers[LOOKUP_TABLE_BUFFER_INDEX],
- bufferSizes[LOOKUP_TABLE_BUFFER_INDEX]),
+ : mExpandableLookupTableBuffer(buffers[LOOKUP_TABLE_BUFFER_INDEX],
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
- mExpandableAddressTableBuffer(
- ReadWriteByteArrayView(buffers[ADDRESS_TABLE_BUFFER_INDEX],
- bufferSizes[ADDRESS_TABLE_BUFFER_INDEX]),
+ mExpandableAddressTableBuffer(buffers[ADDRESS_TABLE_BUFFER_INDEX],
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
- mExpandableContentBuffer(
- ReadWriteByteArrayView(buffers[CONTENT_BUFFER_INDEX],
- bufferSizes[CONTENT_BUFFER_INDEX]),
+ mExpandableContentBuffer(buffers[CONTENT_BUFFER_INDEX],
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer,
sparseTableBlockSize, sparseTableDataSize) {}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
index cf238ee5f..5503151fd 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
+++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp
@@ -14,10 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
@@ -34,7 +33,7 @@ int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId)
bool TerminalPositionLookupTable::setTerminalPtNodePosition(
const int terminalId, const int terminalPtNodePos) {
if (terminalId < 0) {
- return NOT_A_DICT_POS;
+ return false;
}
while (terminalId >= mSize) {
// Write new entry.
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h
index b2262bf1e..f45ceb52d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h
+++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h
@@ -17,13 +17,13 @@
#ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
#define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H
-#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/content/single_dict_content.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "utils/byte_array_view.h"
namespace latinime {
@@ -31,8 +31,8 @@ class TerminalPositionLookupTable : public SingleDictContent {
public:
typedef std::unordered_map<int, int> TerminalIdMap;
- TerminalPositionLookupTable(uint8_t *const buffer, const int bufferSize)
- : SingleDictContent(buffer, bufferSize),
+ TerminalPositionLookupTable(const ReadWriteByteArrayView buffer)
+ : SingleDictContent(buffer),
mSize(getBuffer()->getTailPosition()
/ Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
index 790273541..25ab22543 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
+++ b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h
@@ -18,10 +18,10 @@
#define LATINIME_VER4_SHORTCUT_LIST_POLICY_H
#include "defines.h"
-#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp
index 3c8008dc4..b0a82839b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
#include <cerrno>
#include <cstring>
@@ -23,9 +23,9 @@
#include <sys/types.h>
#include <vector>
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/file_utils.h"
#include "utils/byte_array_view.h"
namespace latinime {
@@ -45,16 +45,13 @@ namespace latinime {
if (!bodyBuffer) {
return Ver4DictBuffersPtr(nullptr);
}
- std::vector<uint8_t *> buffers;
- std::vector<int> bufferSizes;
+ std::vector<ReadWriteByteArrayView> buffers;
const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView();
int position = 0;
while (position < static_cast<int>(buffer.size())) {
const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition(
buffer.data(), &position);
- const ReadWriteByteArrayView subBuffer = buffer.subView(position, bufferSize);
- buffers.push_back(subBuffer.data());
- bufferSizes.push_back(subBuffer.size());
+ buffers.push_back(buffer.subView(position, bufferSize));
position += bufferSize;
if (bufferSize < 0 || position < 0 || position > static_cast<int>(buffer.size())) {
AKLOGE("The dict body file is corrupted.");
@@ -66,7 +63,7 @@ namespace latinime {
return Ver4DictBuffersPtr(nullptr);
}
return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer),
- formatVersion, buffers, bufferSizes));
+ formatVersion, buffers));
}
bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath,
@@ -162,11 +159,6 @@ bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const {
AKLOGE("Language model dict content cannot be written.");
return false;
}
- // Write bigram dict content.
- if (!mBigramDictContent.flushToFile(file)) {
- AKLOGE("Bigram dict content cannot be written.");
- return false;
- }
// Write shortcut dict content.
if (!mShortcutDictContent.flushToFile(file)) {
AKLOGE("Shortcut dict content cannot be written.");
@@ -178,29 +170,18 @@ bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const {
Ver4DictBuffers::Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer,
MmappedBuffer::MmappedBufferPtr &&bodyBuffer,
const FormatUtils::FORMAT_VERSION formatVersion,
- const std::vector<uint8_t *> &contentBuffers, const std::vector<int> &contentBufferSizes)
+ const std::vector<ReadWriteByteArrayView> &contentBuffers)
: mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(std::move(bodyBuffer)),
mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion),
mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(),
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
- mExpandableTrieBuffer(
- ReadWriteByteArrayView(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX],
- contentBufferSizes[Ver4DictConstants::TRIE_BUFFER_INDEX]),
+ mExpandableTrieBuffer(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX],
BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE),
mTerminalPositionLookupTable(
- contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX],
- contentBufferSizes[
- Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]),
- mLanguageModelDictContent(
- ReadWriteByteArrayView(
- contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX],
- contentBufferSizes[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX]),
- mHeaderPolicy.hasHistoricalInfoOfWords()),
- mBigramDictContent(&contentBuffers[Ver4DictConstants::BIGRAM_BUFFERS_INDEX],
- &contentBufferSizes[Ver4DictConstants::BIGRAM_BUFFERS_INDEX],
+ contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]),
+ mLanguageModelDictContent(&contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX],
mHeaderPolicy.hasHistoricalInfoOfWords()),
- mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX],
- &contentBufferSizes[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]),
+ mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]),
mIsUpdatable(mDictBuffer->isUpdatable()) {}
Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize)
@@ -208,7 +189,6 @@ Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const i
mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE),
mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(),
mLanguageModelDictContent(headerPolicy->hasHistoricalInfoOfWords()),
- mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(),
- mIsUpdatable(true) {}
+ mShortcutDictContent(), mIsUpdatable(true) {}
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h
index 68027dcb8..c8270c93c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h
@@ -21,14 +21,13 @@
#include <memory>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+#include "dictionary/structure/v4/content/shortcut_dict_content.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
namespace latinime {
@@ -53,7 +52,6 @@ class Ver4DictBuffers {
return mExpandableTrieBuffer.isNearSizeLimit()
|| mTerminalPositionLookupTable.isNearSizeLimit()
|| mLanguageModelDictContent.isNearSizeLimit()
- || mBigramDictContent.isNearSizeLimit()
|| mShortcutDictContent.isNearSizeLimit();
}
@@ -89,14 +87,6 @@ class Ver4DictBuffers {
return &mLanguageModelDictContent;
}
- AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() {
- return &mBigramDictContent;
- }
-
- AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const {
- return &mBigramDictContent;
- }
-
AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() {
return &mShortcutDictContent;
}
@@ -122,8 +112,7 @@ class Ver4DictBuffers {
Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer,
MmappedBuffer::MmappedBufferPtr &&bodyBuffer,
const FormatUtils::FORMAT_VERSION formatVersion,
- const std::vector<uint8_t *> &contentBuffers,
- const std::vector<int> &contentBufferSizes);
+ const std::vector<ReadWriteByteArrayView> &contentBuffers);
Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize);
@@ -136,7 +125,6 @@ class Ver4DictBuffers {
BufferWithExtendableBuffer mExpandableTrieBuffer;
TerminalPositionLookupTable mTerminalPositionLookupTable;
LanguageModelDictContent mLanguageModelDictContent;
- BigramDictContent mBigramDictContent;
ShortcutDictContent mShortcutDictContent;
const int mIsUpdatable;
};
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp
index 93d4e562d..fd6907824 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
namespace latinime {
@@ -29,52 +29,44 @@ const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024;
// NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT for Trie and TerminalAddressLookupTable.
// NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT for language model.
-// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for bigram and shortcut.
+// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for shortcut.
const size_t Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE =
NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT * 2
+ NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT
- + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT * 2;
+ + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT;
const int Ver4DictConstants::TRIE_BUFFER_INDEX = 0;
const int Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX =
TRIE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
const int Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX =
TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT;
-const int Ver4DictConstants::BIGRAM_BUFFERS_INDEX =
- LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT;
const int Ver4DictConstants::SHORTCUT_BUFFERS_INDEX =
- BIGRAM_BUFFERS_INDEX + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT;
+ LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT;
const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1;
const int Ver4DictConstants::PROBABILITY_SIZE = 1;
-const int Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1;
+const int Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE = 1;
const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3;
const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0;
const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4;
const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4;
-const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1;
-const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1;
+const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0;
+const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2;
+
+const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1;
+const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2;
+const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4;
+const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8;
+const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10;
-const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16;
-const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64;
const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4;
-const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3;
-// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing
-// invalid terminal ID in bigram lists.
-const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID =
- (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1;
-const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1;
-const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F;
-const int Ver4DictConstants::BIGRAM_IS_LINK_MASK = 0x80;
-const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1;
-
const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1;
const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F;
const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80;
const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT = 1;
const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT = 3;
-const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 1;
+const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 2;
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h
index 6950ca70f..13d7a5714 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h
@@ -20,6 +20,7 @@
#include "defines.h"
#include <cstddef>
+#include <cstdint>
namespace latinime {
@@ -41,27 +42,24 @@ class Ver4DictConstants {
static const int NOT_A_TERMINAL_ID;
static const int PROBABILITY_SIZE;
- static const int FLAGS_IN_PROBABILITY_FILE_SIZE;
+ static const int FLAGS_IN_LANGUAGE_MODEL_SIZE;
static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE;
static const int NOT_A_TERMINAL_ADDRESS;
static const int TERMINAL_ID_FIELD_SIZE;
static const int TIME_STAMP_FIELD_SIZE;
+ // TODO: Remove
static const int WORD_LEVEL_FIELD_SIZE;
static const int WORD_COUNT_FIELD_SIZE;
+ // Flags in probability entry.
+ static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE;
+ static const uint8_t FLAG_NOT_A_VALID_ENTRY;
+ static const uint8_t FLAG_NOT_A_WORD;
+ static const uint8_t FLAG_BLACKLISTED;
+ static const uint8_t FLAG_POSSIBLY_OFFENSIVE;
- static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE;
- static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE;
static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE;
static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE;
- static const int BIGRAM_FLAGS_FIELD_SIZE;
- static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
- static const int INVALID_BIGRAM_TARGET_TERMINAL_ID;
- static const int BIGRAM_IS_LINK_MASK;
- static const int BIGRAM_PROBABILITY_MASK;
- // Used when bigram list has time stamp.
- static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE;
-
static const int SHORTCUT_FLAGS_FIELD_SIZE;
static const int SHORTCUT_PROBABILITY_MASK;
static const int SHORTCUT_HAS_NEXT_MASK;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
index 731092efd..b38b03dcb 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp
@@ -14,15 +14,16 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/content/language_model_dict_content.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
@@ -50,26 +51,17 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
const int parentPos =
DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos);
int codePoints[MAX_WORD_LENGTH];
- const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
- dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos);
+ // Code point table is not used for ver4 dictionaries.
+ const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition(
+ dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos);
int terminalIdFieldPos = NOT_A_DICT_POS;
int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID;
- int probability = NOT_A_PROBABILITY;
if (PatriciaTrieReadingUtils::isTerminal(flags)) {
terminalIdFieldPos = pos;
if (usesAdditionalBuffer) {
terminalIdFieldPos += mBuffer->getOriginalBufferSize();
}
terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos);
- // TODO: Quit reading probability here.
- const ProbabilityEntry probabilityEntry =
- mLanguageModelDictContent->getProbabilityEntry(terminalId);
- if (probabilityEntry.hasHistoricalInfo()) {
- probability = ForgettingCurveUtils::decodeProbability(
- probabilityEntry.getHistoricalInfo(), mHeaderPolicy);
- } else {
- probability = probabilityEntry.getProbability();
- }
}
int childrenPosFieldPos = pos;
if (usesAdditionalBuffer) {
@@ -90,8 +82,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce
// The destination position is stored at the same place as the parent position.
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos);
} else {
- return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints,
- terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos,
+ return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints,
+ terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos,
newSiblingNodePos);
}
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
index a91ad5728..4e5ae3a89 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h
@@ -18,8 +18,8 @@
#define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_reader.h"
namespace latinime {
@@ -29,15 +29,12 @@ class LanguageModelDictContent;
/*
* This class is used for helping to read nodes of ver4 patricia trie. This class handles moved
- * node and reads node attributes including probability form language model.
+ * node and reads node attributes.
*/
class Ver4PatriciaTrieNodeReader : public PtNodeReader {
public:
- Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer,
- const LanguageModelDictContent *const languageModelDictContent,
- const HeaderPolicy *const headerPolicy)
- : mBuffer(buffer), mLanguageModelDictContent(languageModelDictContent),
- mHeaderPolicy(headerPolicy) {}
+ explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer)
+ : mBuffer(buffer) {}
~Ver4PatriciaTrieNodeReader() {}
@@ -50,8 +47,6 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader {
DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader);
const BufferWithExtendableBuffer *const mBuffer;
- const LanguageModelDictContent *const mLanguageModelDictContent;
- const HeaderPolicy *const mHeaderPolicy;
const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos,
const int siblingNodePos) const;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
index 857222f5d..d974b50f4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp
@@ -14,20 +14,19 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
-#include "suggest/core/dictionary/property/unigram_property.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
-#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
namespace latinime {
@@ -62,6 +61,7 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted(
}
}
+// TODO: Quit using bigramLinkedNodePos.
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved(
const PtNodeParams *const toBeUpdatedPtNodeParams,
const int movedPos, const int bigramLinkedNodePos) {
@@ -142,13 +142,9 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty(
if (!toBeUpdatedPtNodeParams->isTerminal()) {
return false;
}
- const ProbabilityEntry originalProbabilityEntry =
- mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
- toBeUpdatedPtNodeParams->getTerminalId());
- const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry,
- unigramProperty);
+ const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
- toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry);
+ toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty);
}
bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC(
@@ -160,29 +156,15 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA
const ProbabilityEntry originalProbabilityEntry =
mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
toBeUpdatedPtNodeParams->getTerminalId());
- if (originalProbabilityEntry.hasHistoricalInfo()) {
- const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
- originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy);
- const ProbabilityEntry probabilityEntry =
- originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo);
- if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
- toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) {
- AKLOGE("Cannot write updated probability entry. terminalId: %d",
- toBeUpdatedPtNodeParams->getTerminalId());
- return false;
- }
- const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy);
- if (!isValid) {
- if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
- AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
- return false;
- }
- }
- *outNeedsToKeepPtNode = isValid;
- } else {
- // No need to update probability.
+ if (originalProbabilityEntry.isValid()) {
*outNeedsToKeepPtNode = true;
+ return true;
}
+ if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) {
+ AKLOGE("Cannot mark PtNode as willBecomeNonTerminal.");
+ return false;
+ }
+ *outNeedsToKeepPtNode = false;
return true;
}
@@ -205,7 +187,6 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition(
ptNodeWritingPos);
}
-
bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty,
int *const ptNodeWritingPos) {
@@ -216,31 +197,43 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition(
}
// Write probability.
ProbabilityEntry newProbabilityEntry;
- const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom(
- &newProbabilityEntry, unigramProperty);
+ const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty);
return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry(
- terminalId, &probabilityEntryToWrite);
+ terminalId, &probabilityEntryOfUnigramProperty);
}
+// TODO: Support counting ngram entries.
bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) {
- if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty, outAddedNewBigram)) {
- AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d",
- prevWordIds[0], wordId);
+ const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) {
+ LanguageModelDictContent *const languageModelDictContent =
+ mBuffers->getMutableLanguageModelDictContent();
+ const ProbabilityEntry probabilityEntry =
+ languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId);
+ const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty);
+ if (!languageModelDictContent->setNgramProbabilityEntry(
+ prevWordIds, wordId, &probabilityEntryOfNgramProperty)) {
+ AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d",
+ prevWordIds[0], prevWordIds.size(), wordId);
return false;
}
+ if (!probabilityEntry.isValid() && outAddedNewBigram) {
+ *outAddedNewBigram = true;
+ }
return true;
}
bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds,
const int wordId) {
- return mBigramPolicy->removeEntry(prevWordIds[0], wordId);
+ LanguageModelDictContent *const languageModelDictContent =
+ mBuffers->getMutableLanguageModelDictContent();
+ return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId);
}
+// TODO: Remove when we stop supporting v402 format.
bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries(
const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) {
- return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(
- sourcePtNodeParams->getTerminalId(), outBigramEntryCount);
+ // Do nothing.
+ return true;
}
bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
@@ -275,12 +268,6 @@ bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields(
if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) {
return false;
}
-
- // Counts bigram entries.
- if (outBigramEntryCount) {
- *outBigramEntryCount = mBigramPolicy->getBigramEntryConut(
- toBeUpdatedPtNodeParams->getTerminalId());
- }
return true;
}
@@ -289,7 +276,7 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN
const int shortcutProbability) {
if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(),
targetCodePoints, targetCodePointCount, shortcutProbability)) {
- AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId());
+ AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId());
return false;
}
return true;
@@ -346,37 +333,17 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition(
ptNodeParams->getChildrenPos(), ptNodeWritingPos)) {
return false;
}
- return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(),
- isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
-}
-
-const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom(
- const ProbabilityEntry *const originalProbabilityEntry,
- const UnigramProperty *const unigramProperty) const {
- // TODO: Consolidate historical info and probability.
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
- const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(),
- unigramProperty->getLevel(), unigramProperty->getCount());
- const HistoricalInfo updatedHistoricalInfo =
- ForgettingCurveUtils::createUpdatedHistoricalInfo(
- originalProbabilityEntry->getHistoricalInfo(),
- unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy);
- return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo(
- &updatedHistoricalInfo);
- } else {
- return originalProbabilityEntry->createEntryWithUpdatedProbability(
- unigramProperty->getProbability());
- }
+ return updatePtNodeFlags(nodePos, isTerminal,
+ ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */);
}
-bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos,
- const bool isBlacklisted, const bool isNotAWord, const bool isTerminal,
+bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal,
const bool hasMultipleChars) {
// Create node flags and write them.
PatriciaTrieReadingUtils::NodeFlags nodeFlags =
- PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal,
- false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars,
- CHILDREN_POSITION_FIELD_SIZE);
+ PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */,
+ false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */,
+ false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE);
if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) {
AKLOGE("Cannot write PtNode flags. flags: %x, pos: %d", nodeFlags, ptNodePos);
return false;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
index 6703dba04..55856110b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h
@@ -18,16 +18,15 @@
#define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/pt_common/pt_node_params.h"
+#include "dictionary/structure/pt_common/pt_node_writer.h"
+#include "dictionary/structure/v4/content/probability_entry.h"
namespace latinime {
class BufferWithExtendableBuffer;
class HeaderPolicy;
-class Ver4BigramListPolicy;
class Ver4DictBuffers;
class Ver4PatriciaTrieNodeReader;
class Ver4PtNodeArrayReader;
@@ -39,13 +38,11 @@ class Ver4ShortcutListPolicy;
class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
public:
Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer,
- Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy,
- const PtNodeReader *const ptNodeReader,
+ Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader,
const PtNodeArrayReader *const ptNodeArrayReader,
- Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy)
- : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy),
- mReadingHelper(ptNodeReader, ptNodeArrayReader), mBigramPolicy(bigramPolicy),
- mShortcutPolicy(shortcutPolicy) {}
+ Ver4ShortcutListPolicy *const shortcutPolicy)
+ : mTrieBuffer(trieBuffer), mBuffers(buffers),
+ mReadingHelper(ptNodeReader, ptNodeArrayReader), mShortcutPolicy(shortcutPolicy) {}
virtual ~Ver4PatriciaTrieNodeWriter() {}
@@ -76,7 +73,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos);
virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
+ const NgramProperty *const ngramProperty, bool *const outAddedNewEntry);
virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId);
@@ -98,23 +95,13 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter {
const PtNodeParams *const ptNodeParams, int *const outTerminalId,
int *const ptNodeWritingPos);
- // Create updated probability entry using given unigram property. In addition to the
- // probability, this method updates historical information if needed.
- // TODO: Update flags belonging to the unigram property.
- const ProbabilityEntry createUpdatedEntryFrom(
- const ProbabilityEntry *const originalProbabilityEntry,
- const UnigramProperty *const unigramProperty) const;
-
- bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord,
- const bool isTerminal, const bool hasMultipleChars);
+ bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars);
static const int CHILDREN_POSITION_FIELD_SIZE;
BufferWithExtendableBuffer *const mTrieBuffer;
Ver4DictBuffers *const mBuffers;
- const HeaderPolicy *const mHeaderPolicy;
DynamicPtReadingHelper mReadingHelper;
- Ver4BigramListPolicy *const mBigramPolicy;
Ver4ShortcutListPolicy *const mShortcutPolicy;
};
} // namespace latinime
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
new file mode 100644
index 000000000..1dbec5545
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -0,0 +1,603 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_policy.h"
+
+#include <array>
+#include <vector>
+
+#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/dicnode/dic_node_vector.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/ngram_property.h"
+#include "dictionary/property/unigram_property.h"
+#include "dictionary/property/word_property.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/utils/multi_bigram_map.h"
+#include "dictionary/utils/probability_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Query strings accepted by getProperty().
+// Note that there are corresponding definitions on the Java side in BinaryDictionaryTests and
+// BinaryDictionaryDecayingTests.
+const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
+const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
+// Dynamic updates are refused once the trie buffer's tail position reaches
+// MAX_DICTIONARY_SIZE minus this margin (see MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS).
+const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
+const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
+ Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
+
+// Pushes one child DicNode into childDicNodes for every valid PtNode found in the children
+// PtNode array of dicNode. Nodes that are not terminal, or that are deleted, are pushed with
+// NOT_A_WORD_ID. Flags the dictionary as corrupted when the reading helper reports an error.
+void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
+ DicNodeVector *const childDicNodes) const {
+ if (!dicNode->hasChildren()) {
+ return;
+ }
+ DynamicPtReadingHelper siblingReader(&mNodeReader, &mPtNodeArrayReader);
+ siblingReader.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
+ while (!siblingReader.isEnd()) {
+ const PtNodeParams nodeParams = siblingReader.getPtNodeParams();
+ if (!nodeParams.isValid()) {
+ break;
+ }
+ // Only live terminals expose a word id.
+ int childWordId = NOT_A_WORD_ID;
+ if (nodeParams.isTerminal() && !nodeParams.isDeleted()) {
+ childWordId = nodeParams.getTerminalId();
+ }
+ childDicNodes->pushLeavingChild(dicNode, nodeParams.getChildrenPos(), childWordId,
+ nodeParams.getCodePointArrayView());
+ siblingReader.readNextSiblingNode(nodeParams);
+ }
+ if (!siblingReader.isError()) {
+ return;
+ }
+ mIsCorrupted = true;
+ AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
+}
+
+// Looks up the terminal PtNode position for wordId and asks the reading helper to fill
+// outCodePoints, forwarding maxCodePointCount. Returns the code point count reported by the
+// helper. Flags the dictionary as corrupted when the reading helper reports an error.
+int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId,
+ const int maxCodePointCount, int *const outCodePoints) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ const int ptNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ readingHelper.initWithPtNodePos(ptNodePos);
+ const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount(
+ maxCodePointCount, outCodePoints);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ // Log the current name of this method so the message points at the right call site.
+ AKLOGE("Dictionary reading error in getCodePointsAndReturnCodePointCount().");
+ }
+ return codePointCount;
+}
+
+// Returns the terminal id of the PtNode that represents wordCodePoints, searching from the root
+// PtNode array. Returns NOT_A_WORD_ID when no terminal PtNode is found or when the found PtNode
+// is marked as deleted. Flags the dictionary as corrupted when the reading helper reports an
+// error.
+int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints,
+ const bool forceLowerCaseSearch) const {
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(),
+ wordCodePoints.size(), forceLowerCaseSearch);
+ if (readingHelper.isError()) {
+ mIsCorrupted = true;
+ // Name this method in the log; the previous message pointed at another function.
+ AKLOGE("Dictionary reading error in getWordId().");
+ }
+ if (ptNodePos == NOT_A_DICT_POS) {
+ return NOT_A_WORD_ID;
+ }
+ const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (ptNodeParams.isDeleted()) {
+ return NOT_A_WORD_ID;
+ }
+ return ptNodeParams.getTerminalId();
+}
+
+// Returns the attributes of wordId given prevWordIds, without requiring all previous words to
+// match. Returns default-constructed WordAttributes for NOT_A_WORD_ID. multiBigramMap is not
+// used by this implementation.
+const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext(
+ const WordIdArrayView prevWordIds, const int wordId,
+ MultiBigramMap *const multiBigramMap) const {
+ if (wordId != NOT_A_WORD_ID) {
+ const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
+ return languageModelDictContent->getWordAttributes(prevWordIds, wordId,
+ false /* mustMatchAllPrevWords */, mHeaderPolicy);
+ }
+ return WordAttributes();
+}
+
+// Returns the probability of wordId in the exact context prevWordIds (all previous words must
+// match). Returns NOT_A_PROBABILITY when any id involved is NOT_A_WORD_ID, or when the word is
+// blacklisted or flagged as not-a-word.
+int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds,
+ const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return NOT_A_PROBABILITY;
+ }
+ if (prevWordIds.contains(NOT_A_WORD_ID)) {
+ return NOT_A_PROBABILITY;
+ }
+ const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
+ const WordAttributes attributes = languageModelDictContent->getWordAttributes(prevWordIds,
+ wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
+ if (!attributes.isBlacklisted() && !attributes.isNotAWord()) {
+ return attributes.getProbability();
+ }
+ return NOT_A_PROBABILITY;
+}
+
+// Builds a shortcut iterator for wordId, positioned at the value returned by
+// getShortcutPositionOfWord().
+BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator(
+ const int wordId) const {
+ return BinaryDictionaryShortcutIterator(&mShortcutPolicy, getShortcutPositionOfWord(wordId));
+}
+
+// Visits the n-gram entries that follow the given previous words and reports each one to
+// listener->onVisitEntry(probability, wordId). Contexts of increasing length are visited in turn:
+// prevWordIds.limit(1), prevWordIds.limit(2), ... up to the full prevWordIds. Does nothing when
+// prevWordIds is empty.
+void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds,
+ NgramListener *const listener) const {
+ if (prevWordIds.empty()) {
+ return;
+ }
+ const auto languageModelDictContent = mBuffers->getLanguageModelDictContent();
+ for (size_t i = 1; i <= prevWordIds.size(); ++i) {
+ for (const auto entry : languageModelDictContent->getProbabilityEntries(
+ prevWordIds.limit(i))) {
+ const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry();
+ // Invalid entries are not reported to the listener.
+ if (!probabilityEntry.isValid()) {
+ continue;
+ }
+ int probability = NOT_A_PROBABILITY;
+ if (probabilityEntry.hasHistoricalInfo()) {
+ // TODO: Quit checking count here.
+ // If count <= 1, the word can be an invalid word. The actual probability should
+ // be checked using getWordAttributesInContext() in onVisitEntry().
+ probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ?
+ NOT_A_PROBABILITY : 0;
+ } else {
+ probability = probabilityEntry.getProbability();
+ }
+ listener->onVisitEntry(probability, entry.getWordId());
+ }
+ }
+}
+
+// Returns the head position of the shortcut list for wordId, or NOT_A_DICT_POS when wordId is
+// NOT_A_WORD_ID or the word's PtNode is marked as deleted.
+int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const {
+ if (wordId == NOT_A_WORD_ID) {
+ return NOT_A_DICT_POS;
+ }
+ const int terminalPtNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ const PtNodeParams nodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
+ if (!nodeParams.isDeleted()) {
+ const int terminalId = nodeParams.getTerminalId();
+ return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(terminalId);
+ }
+ return NOT_A_DICT_POS;
+}
+
+// Adds (or updates) the unigram for wordCodePoints using unigramProperty, then adds every
+// shortcut target listed in the property. Returns false when the dictionary is not updatable,
+// is too large for dynamic updates, when the word or a shortcut target exceeds MAX_WORD_LENGTH,
+// or when any underlying write fails.
+bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints,
+ const UnigramProperty *const unigramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert to the dictionary, length: %zd",
+ wordCodePoints.size());
+ return false;
+ }
+ // Validate all shortcut targets up front so nothing is written when one of them is too long.
+ for (const auto &shortcut : unigramProperty->getShortcuts()) {
+ if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd",
+ shortcut.getTargetCodePoints()->size());
+ return false;
+ }
+ }
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ bool addedNewUnigram = false;
+ // Work on a local copy because the beginning-of-sentence marker may be attached to it.
+ int codePointsToAdd[MAX_WORD_LENGTH];
+ int codePointCountToAdd = wordCodePoints.size();
+ memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd);
+ if (unigramProperty->representsBeginningOfSentence()) {
+ codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
+ codePointCountToAdd, MAX_WORD_LENGTH);
+ }
+ if (codePointCountToAdd <= 0) {
+ return false;
+ }
+ const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd);
+ if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty,
+ &addedNewUnigram)) {
+ // Beginning-of-sentence entries are not counted as unigrams.
+ if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
+ mEntryCounters.incrementNgramCount(NgramType::Unigram);
+ }
+ if (unigramProperty->getShortcuts().size() > 0) {
+ // Add shortcut target.
+ const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ AKLOGE("Cannot find word id to add shortcut target.");
+ return false;
+ }
+ const int wordPos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ for (const auto &shortcut : unigramProperty->getShortcuts()) {
+ if (!mUpdatingHelper.addShortcutTarget(wordPos,
+ CodePointArrayView(*shortcut.getTargetCodePoints()),
+ shortcut.getProbability())) {
+ AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, "
+ "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
+ shortcut.getProbability());
+ return false;
+ }
+ }
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Removes the unigram for wordCodePoints: marks its PtNode as deleted, removes its probability
+// entry, and decrements the unigram counter unless the PtNode represents non-word info.
+// Returns false when the dictionary is not updatable, the word is not found, or a write fails.
+bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ return false;
+ }
+ const int ptNodePos =
+ mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
+ // The params are fetched before the node is marked as deleted and are reused below.
+ const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
+ if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
+ AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
+ return false;
+ }
+ if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) {
+ return false;
+ }
+ if (!ptNodeParams.representsNonWordInfo()) {
+ mEntryCounters.decrementNgramCount(NgramType::Unigram);
+ }
+ return true;
+}
+
+// Adds the n-gram described by ngramProperty. Previous words that have no word id are accepted
+// only when they are beginning-of-sentence; in that case a beginning-of-sentence unigram is
+// created first. The target word must already exist. Increments the counter matching the n-gram
+// order when a new entry was added. Returns false on any validation or write failure.
+bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ const NgramContext *const ngramContext = ngramProperty->getNgramContext();
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary.");
+ return false;
+ }
+ if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
+ AKLOGE("The word is too long to insert the ngram to the dictionary. "
+ "length: %zd", ngramProperty->getTargetCodePoints()->size());
+ return false;
+ }
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSearch */);
+ if (prevWordIds.empty()) {
+ return false;
+ }
+ for (size_t i = 0; i < prevWordIds.size(); ++i) {
+ if (prevWordIds[i] != NOT_A_WORD_ID) {
+ continue;
+ }
+ // An unknown previous word is only acceptable when it is the beginning of a sentence.
+ if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) {
+ return false;
+ }
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */, true /* isNotAWord */,
+ false /* isBlacklisted */, false /* isPossiblyOffensive */,
+ MAX_PROBABILITY /* probability */, HistoricalInfo());
+ // NOTE(review): the code points are always taken from the 1st previous word even though
+ // the missing id is at index i; presumably harmless if every beginning-of-sentence
+ // context yields the same code points -- confirm.
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
+ return false;
+ }
+ // Refresh word ids.
+ // The returned view is discarded; presumably prevWordIds views prevWordIdArray and so
+ // observes the refreshed ids -- TODO confirm.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+ }
+ const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()),
+ false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ return false;
+ }
+ bool addedNewEntry = false;
+ if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) {
+ if (addedNewEntry) {
+ mEntryCounters.incrementNgramCount(
+ NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Removes the n-gram entry for wordCodePoints in the context ngramContext and decrements the
+// counter matching the n-gram order. Returns false when the dictionary is not updatable or too
+// large for dynamic updates, when the context is invalid, when the word is longer than
+// MAX_WORD_LENGTH, when any word involved is not in the dictionary, or when the removal fails.
+bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
+ AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
+ mDictBuffer->getTailPosition());
+ return false;
+ }
+ if (!ngramContext->isValid()) {
+ AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary.");
+ return false;
+ }
+ if (wordCodePoints.size() > MAX_WORD_LENGTH) {
+ AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd",
+ wordCodePoints.size());
+ // Bail out like the other length guards in this file instead of looking up an
+ // over-long word.
+ return false;
+ }
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSearch */);
+ if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) {
+ return false;
+ }
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ return false;
+ }
+ if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) {
+ mEntryCounters.decrementNgramCount(
+ NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1));
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Updates the language model entries for wordCodePoints typed after ngramContext, using
+// historicalInfo. A missing word is first added as a unigram with zero level and count; if
+// isValidWord is false the method stops there and returns true. When the first previous word is
+// beginning-of-sentence, a beginning-of-sentence unigram is created if needed and its entries are
+// updated as well, and the typed word is then updated as not valid regardless of isValidWord.
+// Returns false when the dictionary is not updatable or any write fails.
+bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext(
+ const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints,
+ const bool isValidWord, const HistoricalInfo historicalInfo) {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable "
+ "dictionary.");
+ return false;
+ }
+ const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ?
+ false : isValidWord;
+ int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ // The word is not in the dictionary.
+ const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */,
+ false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */,
+ NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */,
+ 0 /* count */));
+ if (!addUnigramEntry(wordCodePoints, &unigramProperty)) {
+ AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ if (!isValidWord) {
+ return true;
+ }
+ wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */);
+ }
+
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray,
+ false /* tryLowerCaseSearch */);
+ if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) {
+ if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) {
+ // Pass isBlacklisted explicitly so this call uses the same argument list as the other
+ // UnigramProperty constructions in this file.
+ const UnigramProperty beginningOfSentenceUnigramProperty(
+ true /* representsBeginningOfSentence */,
+ true /* isNotAWord */, false /* isBlacklisted */,
+ false /* isPossiblyOffensive */, NOT_A_PROBABILITY,
+ HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */));
+ if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */),
+ &beginningOfSentenceUnigramProperty)) {
+ AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext().");
+ return false;
+ }
+ // Refresh word ids.
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */);
+ }
+ // Update entries for beginning of sentence.
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(
+ prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isValid */, historicalInfo,
+ mHeaderPolicy, &mEntryCounters)) {
+ return false;
+ }
+ }
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds,
+ wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) {
+ return false;
+ }
+ return true;
+}
+
+// Writes the dictionary to filePath together with the current entry counts. Returns false for a
+// non-updatable dictionary; on a write failure also flags the dictionary as corrupted.
+bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
+ if (mBuffers->isUpdatable()) {
+ if (mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) {
+ return true;
+ }
+ AKLOGE("Cannot flush the dictionary to file.");
+ mIsCorrupted = true;
+ return false;
+ }
+ AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
+ return false;
+}
+
+// Writes the dictionary to filePath through the writing helper's GC path, starting from the
+// root position. Returns false for a non-updatable dictionary; on a write failure also flags the
+// dictionary as corrupted.
+bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
+ if (mBuffers->isUpdatable()) {
+ if (mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
+ return true;
+ }
+ AKLOGE("Cannot flush the dictionary to file with GC.");
+ mIsCorrupted = true;
+ return false;
+ }
+ AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
+ return false;
+}
+
+// Tells whether a GC pass should be run. Always false for a non-updatable dictionary. Otherwise
+// true when the buffers are near their size limit, when the extended region would exceed
+// MAX_DICT_EXTENDED_REGION_SIZE, or when the trie is at the dynamic-update size threshold while
+// using additional buffer space. For decaying dictionaries the decision is finally delegated to
+// ForgettingCurveUtils::needsToDecay(), which receives mindsBlockByGC.
+bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
+ if (!mBuffers->isUpdatable()) {
+ AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
+ return false;
+ }
+ if (mBuffers->isNearSizeLimit()) {
+ // Additional buffer size is near the limit.
+ return true;
+ } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
+ > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
+ // Total extended region size of the trie exceeds the limit.
+ return true;
+ } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
+ && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
+ // Needs to reduce dictionary size.
+ return true;
+ } else if (mHeaderPolicy->isDecayingDict()) {
+ return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(),
+ mHeaderPolicy);
+ }
+ return false;
+}
+
+// Answers a textual property query by printing a decimal number into outResult (at most
+// maxResultLength bytes via snprintf). Supported queries are UNIGRAM_COUNT_QUERY,
+// BIGRAM_COUNT_QUERY, MAX_UNIGRAM_COUNT_QUERY and MAX_BIGRAM_COUNT_QUERY. For the two MAX queries
+// a decaying dictionary reports the forgetting-curve hard limit and any other dictionary reports
+// MAX_DICTIONARY_SIZE. When the query matches none of them, outResult is left untouched.
+void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
+ char *const outResult, const int maxResultLength) {
+ // Compare one character past queryLength so the terminator takes part in the match
+ // (assumes query is terminated at index queryLength -- confirm against callers).
+ const int compareLength = queryLength + 1 /* terminator */;
+ if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mEntryCounters.getNgramCount(NgramType::Unigram));
+ } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram));
+ } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Unigram)) :
+ static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
+ snprintf(outResult, maxResultLength, "%d",
+ mHeaderPolicy->isDecayingDict() ?
+ ForgettingCurveUtils::getEntryCountHardLimit(
+ mHeaderPolicy->getMaxNgramCounts().getNgramCount(
+ NgramType::Bigram)) :
+ static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
+ }
+}
+
+// Builds a WordProperty snapshot for wordCodePoints: its unigram attributes, its shortcut
+// targets, and every n-gram entry exported by the language model content for this word.
+// Returns a default-constructed WordProperty when the word is not in the dictionary.
+const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
+ const CodePointArrayView wordCodePoints) const {
+ const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) {
+ AKLOGE("getWordProperty is called for invalid word.");
+ return WordProperty();
+ }
+ const LanguageModelDictContent *const languageModelDictContent =
+ mBuffers->getLanguageModelDictContent();
+ // Fetch ngram information.
+ std::vector<NgramProperty> ngrams;
+ // Scratch buffers reused for every exported entry.
+ int ngramTargetCodePoints[MAX_WORD_LENGTH];
+ int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+ int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord(
+ mHeaderPolicy, wordId)) {
+ const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(),
+ MAX_WORD_LENGTH, ngramTargetCodePoints);
+ const WordIdArrayView prevWordIds = entry.getPrevWordIds();
+ for (size_t i = 0; i < prevWordIds.size(); ++i) {
+ ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i],
+ MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]);
+ ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry(
+ prevWordIds[i]).representsBeginningOfSentence();
+ // Strip the marker so the context carries the plain code points plus the flag.
+ if (ngramPrevWordIsBeginningOfSentense[i]) {
+ ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker(
+ ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]);
+ }
+ }
+ const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount,
+ ngramPrevWordIsBeginningOfSentense, prevWordIds.size());
+ const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry();
+ const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo();
+ // TODO: Output flags in WordAttributes.
+ ngrams.emplace_back(ngramContext,
+ CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(),
+ entry.getWordAttributes().getProbability(), *historicalInfo);
+ }
+ // Fetch shortcut information.
+ std::vector<UnigramProperty::ShortcutProperty> shortcuts;
+ int shortcutPos = getShortcutPositionOfWord(wordId);
+ if (shortcutPos != NOT_A_DICT_POS) {
+ int shortcutTarget[MAX_WORD_LENGTH];
+ const ShortcutDictContent *const shortcutDictContent =
+ mBuffers->getShortcutDictContent();
+ bool hasNext = true;
+ // hasNext and shortcutPos are updated by each call through their out-parameters.
+ while (hasNext) {
+ int shortcutTargetLength = 0;
+ int shortcutProbability = NOT_A_PROBABILITY;
+ shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
+ &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
+ shortcuts.emplace_back(
+ CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(),
+ shortcutProbability);
+ }
+ }
+ // Unigram attributes: empty previous-word context.
+ const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
+ WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy);
+ const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
+ const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
+ const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
+ wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
+ wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
+ *historicalInfo, std::move(shortcuts));
+ return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams);
+}
+
+// Token-based iteration over all words. With token 0 the list of terminal PtNode positions is
+// rebuilt by traversing the trie from the root. The word at index `token` is written to
+// outCodePoints / outCodePointCount and the next token is returned. Returns 0 when the token is
+// out of range (with *outCodePointCount set to 0) or when the last word has just been returned,
+// in which case the cached position list is cleared.
+int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
+ int *const outCodePointCount) {
+ *outCodePointCount = 0;
+ if (token == 0) {
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
+ &mTerminalPtNodePositionsForIteratingWords);
+ DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(getRootPosition());
+ readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
+ }
+ const int terminalPtNodePositionsVectorSize =
+ static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
+ if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
+ AKLOGE("Given token %d is invalid.", token);
+ return 0;
+ }
+ const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
+ const PtNodeParams ptNodeParams =
+ mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos);
+ *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(),
+ MAX_WORD_LENGTH, outCodePoints);
+ const int nextToken = token + 1;
+ if (nextToken >= terminalPtNodePositionsVectorSize) {
+ // All words have been iterated.
+ mTerminalPtNodePositionsForIteratingWords.clear();
+ return 0;
+ }
+ return nextToken;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h
index faad4290d..d130a4e78 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h
@@ -20,40 +20,38 @@
#include <vector>
#include "defines.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
-#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "utils/int_array_view.h"
namespace latinime {
class DicNode;
class DicNodeVector;
+// Word id = Artificial id that is stored in the PtNode looked up by the word.
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
public:
Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers)
: mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()),
mDictBuffer(mBuffers->getWritableTrieBuffer()),
- mBigramPolicy(mBuffers->getMutableBigramDictContent(),
- mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy),
mShortcutPolicy(mBuffers->getMutableShortcutDictContent(),
mBuffers->getTerminalPositionLookupTable()),
- mNodeReader(mDictBuffer, mBuffers->getLanguageModelDictContent(), mHeaderPolicy),
- mPtNodeArrayReader(mDictBuffer),
- mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader,
- &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy),
+ mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer),
+ mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader,
+ &mShortcutPolicy),
mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter),
mWritingHelper(mBuffers.get()),
- mUnigramCount(mHeaderPolicy->getUnigramCount()),
- mBigramCount(mHeaderPolicy->getBigramCount()),
+ mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()),
mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {};
AK_FORCE_INLINE int getRootPosition() const {
@@ -63,40 +61,44 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
void createAndGetAllChildDicNodes(const DicNode *const dicNode,
DicNodeVector *const childDicNodes) const;
- int getCodePointsAndProbabilityAndReturnCodePointCount(
- const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints,
- int *const outUnigramProbability) const;
+ int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount,
+ int *const outCodePoints) const;
- int getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const;
+ int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const;
- int getProbability(const int unigramProbability, const int bigramProbability) const;
+ const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds,
+ const int wordId, MultiBigramMap *const multiBigramMap) const;
- int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, const int ptNodePos) const;
+ // TODO: Remove
+ // Stub implementation: both arguments are ignored and NOT_A_PROBABILITY is always returned.
+ // NOTE(review): presumably kept only because DictionaryStructureWithBufferPolicy still
+ // declares it -- confirm before removing.
+ int getProbability(const int unigramProbability, const int bigramProbability) const {
+ // Not used.
+ return NOT_A_PROBABILITY;
+ }
+
+ int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const;
- void iterateNgramEntries(const int *const prevWordsPtNodePos,
+ void iterateNgramEntries(const WordIdArrayView prevWordIds,
NgramListener *const listener) const;
- int getShortcutPositionOfPtNode(const int ptNodePos) const;
+ BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const;
const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const {
return mHeaderPolicy;
}
- const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const {
- return &mShortcutPolicy;
- }
-
- bool addUnigramEntry(const int *const word, const int length,
+ bool addUnigramEntry(const CodePointArrayView wordCodePoints,
const UnigramProperty *const unigramProperty);
- bool removeUnigramEntry(const int *const word, const int length);
+ bool removeUnigramEntry(const CodePointArrayView wordCodePoints);
+
+ bool addNgramEntry(const NgramProperty *const ngramProperty);
- bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty);
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints);
- bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1,
- const int length1);
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView wordCodePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo);
bool flush(const char *const filePath);
@@ -107,8 +109,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
void getProperty(const char *const query, const int queryLength, char *const outResult,
const int maxResultLength);
- const WordProperty getWordProperty(const int *const codePoints,
- const int codePointCount) const;
+ const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const;
int getNextWordAndNextToken(const int token, int *const outCodePoints,
int *const outCodePointCount);
@@ -132,19 +133,17 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy {
const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers;
const HeaderPolicy *const mHeaderPolicy;
BufferWithExtendableBuffer *const mDictBuffer;
- Ver4BigramListPolicy mBigramPolicy;
Ver4ShortcutListPolicy mShortcutPolicy;
Ver4PatriciaTrieNodeReader mNodeReader;
Ver4PtNodeArrayReader mPtNodeArrayReader;
Ver4PatriciaTrieNodeWriter mNodeWriter;
DynamicPtUpdatingHelper mUpdatingHelper;
Ver4PatriciaTrieWritingHelper mWritingHelper;
- int mUnigramCount;
- int mBigramCount;
+ MutableEntryCounters mEntryCounters;
std::vector<int> mTerminalPtNodePositionsForIteratingWords;
mutable bool mIsCorrupted;
- int getBigramsPositionOfPtNode(const int ptNodePos) const;
+ int getShortcutPositionOfWord(const int wordId) const;
};
} // namespace latinime
#endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
index 254022db4..ccb70cdd3 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
index 466ff55d5..466ff55d5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h
diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
new file mode 100644
index 000000000..6dfdf4d31
--- /dev/null
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2013, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
+
+#include <cstring>
+#include <queue>
+
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/structure/v4/ver4_dict_constants.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
+#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Writes the dictionary held by mBuffers to dictDirPath without running GC.
+//
+// A fresh header is built in a temporary buffer from entryCounts and the extended region size,
+// which is the size reported by the header policy plus the additional trie buffer space
+// currently in use. The header is written with updatesLastDecayedTime == false.
+//
+// Returns false (after logging) when the header cannot be written; otherwise returns the result
+// of flushing the header and dictionary buffers.
+bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
+        const EntryCounts &entryCounts) const {
+    const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
+    BufferWithExtendableBuffer headerBuffer(
+            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+    const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
+            + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
+    if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
+            entryCounts, extendedRegionSize, &headerBuffer)) {
+        // The literals below are concatenated into one format string; every field separator
+        // must therefore carry its own trailing space.
+        AKLOGE("Cannot write header structure to buffer. "
+                "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
+                "trigramCount: %d, extendedRegionSize: %d",
+                false, entryCounts.getNgramCount(NgramType::Unigram),
+                entryCounts.getNgramCount(NgramType::Bigram),
+                entryCounts.getNgramCount(NgramType::Trigram),
+                extendedRegionSize);
+        return false;
+    }
+    return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
+}
+
+// Runs GC from rootPtNodeArrayPos into newly created buffers and writes the result to
+// dictDirPath.
+//
+// The header is rebuilt from the entry counts produced by runGC(), with
+// updatesLastDecayedTime == true and an extended region size of 0.
+// Returns false when GC or the header write fails; otherwise returns the result of flushing
+// the new header and buffers.
+bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
+        const char *const dictDirPath) {
+    const HeaderPolicy *const currentHeaderPolicy = mBuffers->getHeaderPolicy();
+    // Destination buffers that receive the GCed dictionary.
+    Ver4DictBuffers::Ver4DictBuffersPtr gcedBuffers(Ver4DictBuffers::createVer4DictBuffers(
+            currentHeaderPolicy, Ver4DictConstants::MAX_DICTIONARY_SIZE));
+    MutableEntryCounters countersAfterGC;
+    const bool gcSucceeded = runGC(rootPtNodeArrayPos, currentHeaderPolicy, gcedBuffers.get(),
+            &countersAfterGC);
+    if (!gcSucceeded) {
+        return false;
+    }
+    BufferWithExtendableBuffer newHeaderBuffer(
+            BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
+    const bool headerWritten = currentHeaderPolicy->fillInAndWriteHeaderToBuffer(
+            true /* updatesLastDecayedTime */, countersAfterGC.getEntryCounts(),
+            0 /* extendedRegionSize */, &newHeaderBuffer);
+    if (!headerWritten) {
+        return false;
+    }
+    return gcedBuffers->flushHeaderAndDictBuffers(dictDirPath, &newHeaderBuffer);
+}
+
+// Garbage-collects the dictionary in mBuffers into buffersToWrite.
+//
+// rootPtNodeArrayPos: position of the root PtNode array; used to start every traversal, both
+//         on the original buffers and on the new ones.
+// headerPolicy: passed to the language model content when updating and truncating entries.
+// buffersToWrite: destination buffers that receive the surviving PtNodes and contents.
+// outEntryCounters: receives the entry counts produced while updating (and, for decaying
+//         dictionaries, truncating) the language model content.
+//
+// Returns false as soon as any step fails. The steps run in a fixed order and later steps use
+// the maps built by earlier ones (dictPositionRelocationMap, terminalIdMap).
+bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
+ const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
+ MutableEntryCounters *const outEntryCounters) {
+ // Readers, shortcut policy and writer that operate on the original buffers (mBuffers).
+ Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer());
+ Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
+ Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
+ mBuffers->getTerminalPositionLookupTable());
+ Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
+ mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
+
+ // Update the probability entries of the original language model content; the resulting
+ // counts are reported through outEntryCounters.
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC(
+ headerPolicy, outEntryCounters)) {
+ AKLOGE("Failed to update probabilities in language model dict content.");
+ return false;
+ }
+ // Decaying dictionaries are additionally truncated against the header's max n-gram counts.
+ if (headerPolicy->isDecayingDict()) {
+ const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts();
+ if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries(
+ outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy,
+ outEntryCounters)) {
+ AKLOGE("Failed to truncate entries in language model dict content.");
+ return false;
+ }
+ }
+
+ // First traversal (postorder) over the original trie. The writer targets mBuffers, so this
+ // pass modifies the original dictionary; per the policy name it updates unigram
+ // probabilities and marks useless PtNodes as deleted.
+ DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners
+ ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
+ traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
+ &ptNodeWriter);
+ if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
+ return false;
+ }
+
+ // Mapping from positions in mBuffers to positions in buffersToWrite.
+ PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
+ // Second traversal over the original trie: still reads through the original readers, but
+ // writes into buffersToWrite and records relocations in dictPositionRelocationMap.
+ readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
+ buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy);
+ DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
+ traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
+ buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
+ if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
+ return false;
+ }
+
+ // Create policy instances for the GCed dictionary.
+ Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer());
+ Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
+ Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
+ buffersToWrite->getTerminalPositionLookupTable());
+ Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
+ buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader,
+ &newShortcutPolicy);
+ // Re-assign terminal IDs for valid terminal PtNodes.
+ // terminalIdMap is filled here and consumed by the content GC calls and the final traversal.
+ TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
+ if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
+ &terminalIdMap)) {
+ return false;
+ }
+ // Run GC for language model dict content.
+ // The original content (from mBuffers) is passed as the source.
+ if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
+ mBuffers->getLanguageModelDictContent())) {
+ return false;
+ }
+ // Run GC for shortcut dict content.
+ // The original content (from mBuffers) is passed as the source.
+ if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
+ mBuffers->getShortcutDictContent())) {
+ return false;
+ }
+ // Third traversal, now over the new trie: fix up position fields using the relocation map.
+ DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
+ newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
+ traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
+ if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
+ &traversePolicyToUpdateAllPositionFields)) {
+ return false;
+ }
+ // Final traversal (postorder) over the new trie: rewrite terminal ids using terminalIdMap.
+ newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
+ TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+ traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
+ if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
+ &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
+ return false;
+ }
+ return true;
+}
+
+// Traversal callback invoked for each visited PtNode.
+//
+// Non-terminal PtNodes are left untouched. For a terminal PtNode, its current terminal id is
+// looked up in mTerminalIdMap and replaced with the mapped id via mPtNodeWriter.
+// Returns false (after logging) when the id is missing from the map or the update fails;
+// returns true otherwise.
+bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
+        ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
+    if (!ptNodeParams->isTerminal()) {
+        return true;
+    }
+    const auto it = mTerminalIdMap->find(ptNodeParams->getTerminalId());
+    if (it == mTerminalIdMap->end()) {
+        // size() yields an unsigned size type, so %zu (not %zd) is the matching specifier.
+        AKLOGE("terminal Id %d is not in the terminal position map. map size: %zu",
+                ptNodeParams->getTerminalId(), mTerminalIdMap->size());
+        return false;
+    }
+    if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
+        AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
+        return false;
+    }
+    return true;
+}
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
index bb464ad28..68dd1caa2 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h
@@ -18,8 +18,9 @@
#define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h"
+#include "dictionary/structure/v4/content/terminal_position_lookup_table.h"
+#include "dictionary/utils/entry_counters.h"
namespace latinime {
@@ -33,8 +34,7 @@ class Ver4PatriciaTrieWritingHelper {
Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers)
: mBuffers(buffers) {}
- bool writeToDictFile(const char *const dictDirPath, const int unigramCount,
- const int bigramCount) const;
+ bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const;
// This method cannot be const because the original dictionary buffer will be updated to detect
// useless PtNodes during GC.
@@ -66,57 +66,8 @@ class Ver4PatriciaTrieWritingHelper {
const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap;
};
- // For truncateUnigrams() and truncateBigrams().
- class DictProbability {
- public:
- DictProbability(const int dictPos, const int probability, const int timestamp)
- : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {}
-
- int getDictPos() const {
- return mDictPos;
- }
-
- int getProbability() const {
- return mProbability;
- }
-
- int getTimestamp() const {
- return mTimestamp;
- }
-
- private:
- DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability);
-
- int mDictPos;
- int mProbability;
- int mTimestamp;
- };
-
- // For truncateUnigrams() and truncateBigrams().
- class DictProbabilityComparator {
- public:
- bool operator()(const DictProbability &left, const DictProbability &right) {
- if (left.getProbability() != right.getProbability()) {
- return left.getProbability() > right.getProbability();
- }
- if (left.getTimestamp() != right.getTimestamp()) {
- return left.getTimestamp() < right.getTimestamp();
- }
- return left.getDictPos() > right.getDictPos();
- }
-
- private:
- DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator);
- };
-
bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy,
- Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount,
- int *const outBigramCount);
-
- bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader,
- Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount);
-
- bool truncateBigrams(const int maxBigramCount);
+ Ver4DictBuffers *const buffersToWrite, MutableEntryCounters *const outEntryCounters);
Ver4DictBuffers *const mBuffers;
};
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
index b014c523d..63d0b4ad5 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
+++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp
@@ -14,11 +14,11 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
+#include "dictionary/structure/v4/ver4_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h"
+#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h
index d81808efc..ccb760bc1 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h
+++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h
@@ -18,7 +18,7 @@
#define LATINIME_VER4_PT_NODE_ARRAY_READER_H
#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h"
+#include "dictionary/structure/pt_common/pt_node_array_reader.h"
namespace latinime {
diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h
index 178b06554..8a614730b 100644
--- a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h
+++ b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h
@@ -18,7 +18,7 @@
#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H
#include "defines.h"
-#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
+#include "dictionary/interface/dictionary_bigrams_structure_policy.h"
namespace latinime {
diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h
index 558e0a5c3..a4ddd58c2 100644
--- a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h
+++ b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h
@@ -18,7 +18,7 @@
#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H
#include "defines.h"
-#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h"
+#include "dictionary/interface/dictionary_shortcuts_structure_policy.h"
namespace latinime {
@@ -31,6 +31,11 @@ class BinaryDictionaryShortcutIterator {
mPos(shortcutStructurePolicy->getStartPos(shortcutPos)),
mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {}
+ // Constructs an iterator from a temporary one by copying its policy pointer, current
+ // position and has-next flag. The parameter is a const rvalue reference, so the source is
+ // never modified.
+ // NOTE(review): presumably added so that iterators can be returned by value -- confirm.
+ BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator)
+ : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy),
+ mPos(shortcutIterator.mPos),
+ mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {}
+
AK_FORCE_INLINE bool hasNextShortcutTarget() const {
return mHasNextShortcutTarget;
}
@@ -45,7 +50,8 @@ class BinaryDictionaryShortcutIterator {
}
private:
- DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryShortcutIterator);
+ DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator);
+ DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator);
const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy;
int mPos;
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/dictionary/utils/bloom_filter.h
index 1e60f49ed..1e60f49ed 100644
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.h
+++ b/native/jni/src/dictionary/utils/bloom_filter.h
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp
index 833063c17..217569651 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp
+++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
@@ -31,7 +31,7 @@ uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) con
uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size,
int *const pos) const {
- const int value = readUint(size, *pos);
+ const uint32_t value = readUint(size, *pos);
*pos += size;
return value;
}
@@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC
if (readingPosIsInAdditionalBuffer) {
*pos -= mOriginalBuffer.size();
}
+ // Code point table is not used for dynamic format.
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition(
- getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos);
+ getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount,
+ nullptr /* codePointTable */, outCodePoints, pos);
if (readingPosIsInAdditionalBuffer) {
*pos += mOriginalBuffer.size();
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h
index fad83aa25..0a141d4db 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h
+++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h
@@ -22,7 +22,7 @@
#include <vector>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
#include "utils/byte_array_view.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/native/jni/src/dictionary/utils/byte_array_utils.cpp
index 1833e8832..d38f08217 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp
+++ b/native/jni/src/dictionary/utils/byte_array_utils.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/dictionary/utils/byte_array_utils.h
index c0a9fcb1d..abb979050 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h
+++ b/native/jni/src/dictionary/utils/byte_array_utils.h
@@ -114,7 +114,7 @@ class ByteArrayUtils {
return buffer[(*pos)++];
}
- static AK_FORCE_INLINE int readUint(const uint8_t *const buffer,
+ static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer,
const int size, const int pos) {
// size must be in 1 to 4.
ASSERT(size >= 1 && size <= 4);
@@ -147,11 +147,18 @@ class ByteArrayUtils {
*/
static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) {
int p = pos;
- return readCodePointAndAdvancePosition(buffer, &p);
+ return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p);
}
static AK_FORCE_INLINE int readCodePointAndAdvancePosition(
- const uint8_t *const buffer, int *const pos) {
+ const uint8_t *const buffer, const int *const codePointTable, int *const pos) {
+ /*
+ * codePointTable is an array to convert the most frequent characters in this dictionary to
+ * 1 byte code points. It is only made of the original code points of the most frequent
+ * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters.
+ * The original code points are restored by picking the code points at the indices of the
+ * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte.
+ */
const uint8_t firstByte = readUint8(buffer, *pos);
if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) {
if (firstByte == CHARACTER_ARRAY_TERMINATOR) {
@@ -162,6 +169,9 @@ class ByteArrayUtils {
}
} else {
*pos += 1;
+ if (codePointTable) {
+ return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE];
+ }
return firstByte;
}
}
@@ -173,12 +183,13 @@ class ByteArrayUtils {
*/
// Returns the length of the string.
static int readStringAndAdvancePosition(const uint8_t *const buffer,
- const int maxLength, int *const outBuffer, int *const pos) {
+ const int maxLength, const int *const codePointTable, int *const outBuffer,
+ int *const pos) {
int length = 0;
- int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
outBuffer[length++] = codePoint;
- codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos);
}
return length;
}
@@ -187,9 +198,9 @@ class ByteArrayUtils {
static int advancePositionToBehindString(
const uint8_t *const buffer, const int maxLength, int *const pos) {
int length = 0;
- int codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
while (NOT_A_CODE_POINT != codePoint && length < maxLength) {
- codePoint = readCodePointAndAdvancePosition(buffer, pos);
+ codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos);
length++;
}
return length;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp
index b7e2a7278..033a758ba 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp
+++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
#include <cstdio>
#include <errno.h>
@@ -22,13 +22,14 @@
#include <sys/stat.h>
#include <sys/types.h>
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/structure/backward/v402/ver4_dict_buffers.h"
+#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h"
+#include "dictionary/structure/v4/ver4_dict_buffers.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/entry_counters.h"
+#include "dictionary/utils/file_utils.h"
+#include "dictionary/utils/format_utils.h"
#include "utils/time_keeper.h"
namespace latinime {
@@ -43,13 +44,13 @@ const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4;
TimeKeeper::setCurrentTime();
const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion);
switch (formatVersion) {
- case FormatUtils::VERSION_4:
+ case FormatUtils::VERSION_402:
return createEmptyV4DictFile<backward::v402::Ver4DictConstants,
backward::v402::Ver4DictBuffers,
backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>(
filePath, localeAsCodePointVector, attributeMap, formatVersion);
case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
- case FormatUtils::VERSION_4_DEV:
+ case FormatUtils::VERSION_403:
return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers,
Ver4DictBuffers::Ver4DictBuffersPtr>(
filePath, localeAsCodePointVector, attributeMap, formatVersion);
@@ -69,8 +70,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr>
DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy,
DictConstants::MAX_DICT_EXTENDED_REGION_SIZE);
headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
- 0 /* unigramCount */, 0 /* bigramCount */,
- 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer());
+ EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer());
if (!DynamicPtWritingUtils::writeEmptyDictionary(
dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) {
AKLOGE("Empty ver4 dictionary structure cannot be created on memory.");
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/dictionary/utils/dict_file_writing_utils.h
index 4843b3b32..102a89da4 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h
+++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.h
@@ -20,8 +20,8 @@
#include <cstdio>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/utils/format_utils.h"
namespace latinime {
diff --git a/native/jni/src/dictionary/utils/entry_counters.h b/native/jni/src/dictionary/utils/entry_counters.h
new file mode 100644
index 000000000..5e443026e
--- /dev/null
+++ b/native/jni/src/dictionary/utils/entry_counters.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_ENTRY_COUNTERS_H
+#define LATINIME_ENTRY_COUNTERS_H
+
+#include <array>
+
+#include "defines.h"
+#include "utils/ngram_utils.h"
+
+namespace latinime {
+
+// Copyable but immutable: a fixed snapshot of the entry count of each n-gram type.
+class EntryCounts final {
+ public:
+ EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {}  // Every count starts at zero.
+
+ explicit EntryCounts(const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters)
+ : mEntryCounts(counters) {}  // Stores a copy of counters.
+
+ int getNgramCount(const NgramType ngramType) const {  // Count stored for ngramType.
+ return mEntryCounts[static_cast<int>(ngramType)];  // The enum value is the array index.
+ }
+
+ const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &getCountArray() const {
+ return mEntryCounts;  // Read-only reference to the whole array; no copy is made.
+ }
+
+ private:
+ DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts);  // NOTE(review): presumably deletes operator=.
+
+ // Element i holds the (i + 1)-gram count: the unigram count is the 0-th element and the
+ // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram count is the MAX_PREV_WORD_COUNT_FOR_N_GRAM-th.
+ const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounts;
+};
+
+class MutableEntryCounters final {  // Modifiable per-n-gram-type entry counters.
+ public:
+ MutableEntryCounters() {
+ mEntryCounters.fill(0);  // Every counter starts at zero.
+ }
+
+ explicit MutableEntryCounters(
+ const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters)
+ : mEntryCounters(counters) {}  // Starts from a copy of counters.
+
+ const EntryCounts getEntryCounts() const {
+ return EntryCounts(mEntryCounters);  // Hands the current values to a new EntryCounts.
+ }
+
+ void incrementNgramCount(const NgramType ngramType) {
+ ++mEntryCounters[static_cast<int>(ngramType)];  // The enum value is the array index.
+ }
+
+ void decrementNgramCount(const NgramType ngramType) {
+ --mEntryCounters[static_cast<int>(ngramType)];  // Not clamped; the value can go negative.
+ }
+
+ int getNgramCount(const NgramType ngramType) const {
+ return mEntryCounters[static_cast<int>(ngramType)];  // Current counter for ngramType.
+ }
+
+ void setNgramCount(const NgramType ngramType, const int count) {
+ mEntryCounters[static_cast<int>(ngramType)] = count;  // Overwrites; count is not validated.
+ }
+
+ private:
+ DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters);  // NOTE(review): presumably forbids copying.
+
+ // Element i holds the (i + 1)-gram counter: the unigram counter is the 0-th element and the
+ // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram counter is the MAX_PREV_WORD_COUNT_FOR_N_GRAM-th.
+ std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounters;
+};
+} // namespace latinime
+#endif /* LATINIME_ENTRY_COUNTERS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp b/native/jni/src/dictionary/utils/file_utils.cpp
index fb80f38c5..bb392fb32 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp
+++ b/native/jni/src/dictionary/utils/file_utils.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
+#include "dictionary/utils/file_utils.h"
#include <cstdio>
#include <cstring>
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h b/native/jni/src/dictionary/utils/file_utils.h
index 4f1b93a6a..4f1b93a6a 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h
+++ b/native/jni/src/dictionary/utils/file_utils.h
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp
index fed0ae77e..d79ed911b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp
+++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp
@@ -14,14 +14,14 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
+#include "dictionary/utils/forgetting_curve_utils.h"
#include <algorithm>
#include <cmath>
#include <stdlib.h>
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
+#include "dictionary/header/header_policy.h"
+#include "dictionary/utils/probability_utils.h"
#include "utils/time_keeper.h"
namespace latinime {
@@ -29,13 +29,16 @@ namespace latinime {
const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8;
const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60;
-const int ForgettingCurveUtils::MAX_LEVEL = 3;
-const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 1;
-const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15;
-const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14;
+const int ForgettingCurveUtils::MAX_LEVEL = 15;
+const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2;
+const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31;
+const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30;
+const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1;
+// TODO: Evaluate whether this should be 7.5 days.
+// 15 days
+const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60;
-const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
-const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2;
+const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2;
const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable;
@@ -43,7 +46,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo(
const HistoricalInfo *const originalHistoricalInfo, const int newProbability,
const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) {
- const int timestamp = newHistoricalInfo->getTimeStamp();
+ const int timestamp = newHistoricalInfo->getTimestamp();
if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) {
// Add entry as a valid word.
const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel());
@@ -54,19 +57,23 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
|| (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel()
&& originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) {
// Initial information.
+ int count = newHistoricalInfo->getCount();
+ if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
+ const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1);
+ return HistoricalInfo(timestamp, level, 0 /* count */);
+ }
const int level = clampToValidLevelRange(newHistoricalInfo->getLevel());
- const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy);
- return HistoricalInfo(timestamp, level, count);
+ return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy));
} else {
const int updatedCount = originalHistoricalInfo->getCount() + 1;
- if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) {
+ if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) {
// The count exceeds the max value the level can be incremented.
if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) {
// The level is already max.
return HistoricalInfo(timestamp,
originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount());
} else {
- // Level up.
+ // Raise the level.
return HistoricalInfo(timestamp,
originalHistoricalInfo->getLevel() + 1, 0 /* count */);
}
@@ -78,66 +85,54 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
/* static */ int ForgettingCurveUtils::decodeProbability(
const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) {
- const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
- headerPolicy->getForgettingCurveDurationToLevelDown());
+ const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(),
+ DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS);
return sProbabilityTable.getProbability(
headerPolicy->getForgettingCurveProbabilityValuesTableId(),
clampToValidLevelRange(historicalInfo->getLevel()),
clampToValidTimeStepCountRange(elapsedTimeStepCount));
}
-/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability,
- const int bigramProbability) {
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return std::min(backoff(unigramProbability), MAX_PROBABILITY);
- } else {
- // TODO: Investigate better way to handle bigram probability.
- return std::min(std::max(unigramProbability,
- bigramProbability + MULTIPLIER_TWO_IN_PROBABILITY_SCALE), MAX_PROBABILITY);
- }
-}
-
/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy) {
return historicalInfo->getLevel() > 0
- || getElapsedTimeStepCount(historicalInfo->getTimeStamp(),
- headerPolicy->getForgettingCurveDurationToLevelDown())
+ || getElapsedTimeStepCount(historicalInfo->getTimestamp(),
+ DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS)
< DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
}
/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave(
const HistoricalInfo *const originalHistoricalInfo,
const HeaderPolicy *const headerPolicy) {
- if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) {
+ if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) {
return HistoricalInfo();
}
- const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown();
+ const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
const int elapsedTimeStep = getElapsedTimeStepCount(
- originalHistoricalInfo->getTimeStamp(), durationToLevelDownInSeconds);
+ originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds);
if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) {
// No need to update historical info.
return *originalHistoricalInfo;
}
- // Level down.
+ // Lower the level.
const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1);
const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ?
originalHistoricalInfo->getLevel() : maxLevelDownAmonut;
- const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimeStamp() +
+ const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() +
levelDownAmount * durationToLevelDownInSeconds;
return HistoricalInfo(adjustedTimestampInSeconds,
originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */);
}
/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay,
- const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) {
- if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) {
- // Unigram count exceeds the limit.
- return true;
- } else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) {
- // Bigram count exceeds the limit.
- return true;
+ const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) {
+ const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts();
+ for (const auto ngramType : AllNgramTypes::ASCENDING) {
+ if (entryCounts.getNgramCount(ngramType)
+ >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) {
+ // The entry count for this n-gram type has reached its hard limit.
+ return true;
+ }
}
if (mindsBlockByDecay) {
return false;
@@ -170,7 +165,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT
/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count,
const HeaderPolicy *const headerPolicy) {
- return std::min(std::max(count, 0), headerPolicy->getForgettingCurveOccurrencesToLevelUp() - 1);
+ return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1);
}
/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) {
@@ -187,9 +182,9 @@ const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID =
const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2;
const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3;
const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127;
-const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 32;
-const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 35;
-const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 40;
+const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8;
+const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9;
+const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10;
ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
@@ -202,7 +197,7 @@ ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() {
const float endProbability = getBaseProbabilityForLevel(tableId, level - 1);
for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT;
++timeStepCount) {
- if (level == 0) {
+ if (level < MIN_VISIBLE_LEVEL) {
mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY;
continue;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/dictionary/utils/forgetting_curve_utils.h
index 9910777b8..ddaac7e3b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h
+++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.h
@@ -20,7 +20,8 @@
#include <vector>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/utils/historical_info.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/utils/entry_counters.h"
namespace latinime {
@@ -39,23 +40,20 @@ class ForgettingCurveUtils {
static int decodeProbability(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy);
- static int getProbability(const int encodedUnigramProbability,
- const int encodedBigramProbability);
-
static bool needsToKeep(const HistoricalInfo *const historicalInfo,
const HeaderPolicy *const headerPolicy);
- static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount,
- const int bigramCount, const HeaderPolicy *const headerPolicy);
+ static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters,
+ const HeaderPolicy *const headerPolicy);
- AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) {
- return static_cast<int>(static_cast<float>(maxUnigramCount)
- * UNIGRAM_COUNT_HARD_LIMIT_WEIGHT);
+ // TODO: Improve probability computation method and remove this.
+ static int getProbabilityBiasForNgram(const int n) {
+ return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE;
}
- AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) {
- return static_cast<int>(static_cast<float>(maxBigramCount)
- * BIGRAM_COUNT_HARD_LIMIT_WEIGHT);
+ AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) {
+ return static_cast<int>(static_cast<float>(maxEntryCount)
+ * ENTRY_COUNT_HARD_LIMIT_WEIGHT);
}
private:
@@ -96,9 +94,10 @@ class ForgettingCurveUtils {
static const int MIN_VISIBLE_LEVEL;
static const int MAX_ELAPSED_TIME_STEP_COUNT;
static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD;
+ static const int OCCURRENCES_TO_RAISE_THE_LEVEL;
+ static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS;
- static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT;
- static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT;
+ static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT;
static const ProbabilityTable sProbabilityTable;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/dictionary/utils/format_utils.cpp
index 1916ea560..cef3b094c 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp
+++ b/native/jni/src/dictionary/utils/format_utils.cpp
@@ -14,40 +14,44 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/format_utils.h"
+#include "dictionary/utils/format_utils.h"
-#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h"
+#include "dictionary/utils/byte_array_utils.h"
namespace latinime {
const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE;
// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12
-const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
+const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) {
switch (formatVersion) {
case VERSION_2:
- return VERSION_2;
+ case VERSION_201:
+ AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
+ return UNKNOWN_VERSION;
+ case VERSION_202:
+ return VERSION_202;
case VERSION_4_ONLY_FOR_TESTING:
return VERSION_4_ONLY_FOR_TESTING;
- case VERSION_4:
- return VERSION_4;
- case VERSION_4_DEV:
- return VERSION_4_DEV;
+ case VERSION_402:
+ return VERSION_402;
+ case VERSION_403:
+ return VERSION_403;
default:
return UNKNOWN_VERSION;
}
}
/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion(
- const uint8_t *const dict, const int dictSize) {
+ const ReadOnlyByteArrayView dictBuffer) {
// The magic number is stored big-endian.
// If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't
// understand this format.
- if (dictSize < DICTIONARY_MINIMUM_SIZE) {
+ if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) {
return UNKNOWN_VERSION;
}
- const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0);
+ const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0);
switch (magicNumber) {
case MAGIC_NUMBER:
// The layout of the header is as follows:
@@ -58,7 +62,7 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12;
// Conceptually this converts the hardcoded value of the bytes in the file into
// the symbolic value we use in the code. But we want the constants to be the
// same so we use them for both here.
- return getFormatVersion(ByteArrayUtils::readUint16(dict, 4));
+ return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4));
default:
return UNKNOWN_VERSION;
}
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/dictionary/utils/format_utils.h
index 55ad5799f..1616efcce 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h
+++ b/native/jni/src/dictionary/utils/format_utils.h
@@ -20,6 +20,7 @@
#include <cstdint>
#include "defines.h"
+#include "utils/byte_array_view.h"
namespace latinime {
@@ -30,10 +31,15 @@ class FormatUtils {
public:
enum FORMAT_VERSION {
// These MUST have the same values as the relevant constants in FormatSpec.java.
+ // TODO: Remove VERSION_2 and VERSION_201 once:
+ // * we have confirmed that old versions of LatinIME download old-format dictionaries, and
+ // * we no longer need the corresponding constants on the Java side for dicttool.
VERSION_2 = 2,
+ VERSION_201 = 201,
+ VERSION_202 = 202,
VERSION_4_ONLY_FOR_TESTING = 399,
- VERSION_4 = 402,
- VERSION_4_DEV = 403,
+ VERSION_402 = 402,
+ VERSION_403 = 403,
UNKNOWN_VERSION = -1
};
@@ -42,12 +48,12 @@ class FormatUtils {
static const uint32_t MAGIC_NUMBER;
static FORMAT_VERSION getFormatVersion(const int formatVersion);
- static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize);
+ static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils);
- static const int DICTIONARY_MINIMUM_SIZE;
+ static const size_t DICTIONARY_MINIMUM_SIZE;
};
} // namespace latinime
#endif /* LATINIME_FORMAT_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/dictionary/utils/mmapped_buffer.cpp
index 4a126ff85..c5259de6d 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp
+++ b/native/jni/src/dictionary/utils/mmapped_buffer.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h"
+#include "dictionary/utils/mmapped_buffer.h"
#include <cerrno>
#include <climits>
@@ -23,7 +23,7 @@
#include <sys/mman.h>
#include <unistd.h>
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
+#include "dictionary/utils/file_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/native/jni/src/dictionary/utils/mmapped_buffer.h
index e25310373..e25310373 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h
+++ b/native/jni/src/dictionary/utils/mmapped_buffer.h
diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp b/native/jni/src/dictionary/utils/multi_bigram_map.cpp
index 91f33a8dd..e730fff8e 100644
--- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp
+++ b/native/jni/src/dictionary/utils/multi_bigram_map.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/core/dictionary/multi_bigram_map.h"
+#include "dictionary/utils/multi_bigram_map.h"
#include <cstddef>
#include <unordered_map>
@@ -35,39 +35,37 @@ const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP =
// Also caches the bigrams if there is space remaining and they have not been cached already.
int MultiBigramMap::getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos, const int nextWordPosition,
+ const WordIdArrayView prevWordIds, const int nextWordId,
const int unigramProbability) {
- if (!prevWordsPtNodePos || prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
+ if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) {
return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY);
}
- std::unordered_map<int, BigramMap>::const_iterator mapPosition =
- mBigramMaps.find(prevWordsPtNodePos[0]);
+ const auto mapPosition = mBigramMaps.find(prevWordIds[0]);
if (mapPosition != mBigramMaps.end()) {
- return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition,
+ return mapPosition->second.getBigramProbability(structurePolicy, nextWordId,
unigramProbability);
}
if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) {
- addBigramsForWordPosition(structurePolicy, prevWordsPtNodePos);
- return mBigramMaps[prevWordsPtNodePos[0]].getBigramProbability(structurePolicy,
- nextWordPosition, unigramProbability);
+ addBigramsForWord(structurePolicy, prevWordIds);
+ return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy,
+ nextWordId, unigramProbability);
}
- return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordsPtNodePos,
- nextWordPosition, unigramProbability);
+ return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds,
+ nextWordId, unigramProbability);
}
void MultiBigramMap::BigramMap::init(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos) {
- structurePolicy->iterateNgramEntries(prevWordsPtNodePos, this /* listener */);
+ const WordIdArrayView prevWordIds) {
+ structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */);
}
int MultiBigramMap::BigramMap::getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int nextWordPosition, const int unigramProbability) const {
+ const int nextWordId, const int unigramProbability) const {
int bigramProbability = NOT_A_PROBABILITY;
- if (mBloomFilter.isInFilter(nextWordPosition)) {
- const std::unordered_map<int, int>::const_iterator bigramProbabilityIt =
- mBigramMap.find(nextWordPosition);
+ if (mBloomFilter.isInFilter(nextWordId)) {
+ const auto bigramProbabilityIt = mBigramMap.find(nextWordId);
if (bigramProbabilityIt != mBigramMap.end()) {
bigramProbability = bigramProbabilityIt->second;
}
@@ -75,29 +73,24 @@ int MultiBigramMap::BigramMap::getBigramProbability(
return structurePolicy->getProbability(unigramProbability, bigramProbability);
}
-void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability,
- const int targetPtNodePos) {
- if (targetPtNodePos == NOT_A_DICT_POS) {
+void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) {
+ if (targetWordId == NOT_A_WORD_ID) {
return;
}
- mBigramMap[targetPtNodePos] = ngramProbability;
- mBloomFilter.setInFilter(targetPtNodePos);
+ mBigramMap[targetWordId] = ngramProbability;
+ mBloomFilter.setInFilter(targetWordId);
}
-void MultiBigramMap::addBigramsForWordPosition(
+void MultiBigramMap::addBigramsForWord(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos) {
- if (prevWordsPtNodePos) {
- mBigramMaps[prevWordsPtNodePos[0]].init(structurePolicy, prevWordsPtNodePos);
- }
+ const WordIdArrayView prevWordIds) {
+ mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds);
}
int MultiBigramMap::readBigramProbabilityFromBinaryDictionary(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos, const int nextWordPosition,
- const int unigramProbability) {
- const int bigramProbability = structurePolicy->getProbabilityOfPtNode(prevWordsPtNodePos,
- nextWordPosition);
+ const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) {
+ const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId);
if (bigramProbability != NOT_A_PROBABILITY) {
return bigramProbability;
}
diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/dictionary/utils/multi_bigram_map.h
index ad36dde83..6f23d98bc 100644
--- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h
+++ b/native/jni/src/dictionary/utils/multi_bigram_map.h
@@ -21,10 +21,11 @@
#include <unordered_map>
#include "defines.h"
-#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
-#include "suggest/core/dictionary/bloom_filter.h"
-#include "suggest/core/dictionary/ngram_listener.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/utils/binary_dictionary_bigrams_iterator.h"
+#include "dictionary/utils/bloom_filter.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -39,8 +40,7 @@ class MultiBigramMap {
// Look up the bigram probability for the given word pair from the cached bigram maps.
// Also caches the bigrams if there is space remaining and they have not been cached already.
int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos, const int nextWordPosition,
- const int unigramProbability);
+ const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability);
void clear() {
mBigramMaps.clear();
@@ -58,11 +58,11 @@ class MultiBigramMap {
virtual ~BigramMap() {}
void init(const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos);
+ const WordIdArrayView prevWordIds);
int getBigramProbability(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int nextWordPosition, const int unigramProbability) const;
- virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos);
+ const int nextWordId, const int unigramProbability) const;
+ virtual void onVisitEntry(const int ngramProbability, const int targetWordId);
private:
static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP;
@@ -70,14 +70,12 @@ class MultiBigramMap {
BloomFilter mBloomFilter;
};
- void addBigramsForWordPosition(
- const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos);
+ void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy,
+ const WordIdArrayView prevWordIds);
int readBigramProbabilityFromBinaryDictionary(
const DictionaryStructureWithBufferPolicy *const structurePolicy,
- const int *const prevWordsPtNodePos, const int nextWordPosition,
- const int unigramProbability);
+ const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability);
static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP;
std::unordered_map<int, BigramMap> mBigramMaps;
diff --git a/native/jni/src/dictionary/utils/probability_utils.cpp b/native/jni/src/dictionary/utils/probability_utils.cpp
new file mode 100644
index 000000000..426a0e783
--- /dev/null
+++ b/native/jni/src/dictionary/utils/probability_utils.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/utils/probability_utils.h"
+
+namespace latinime {
+
+const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f;
+
+} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/dictionary/utils/probability_utils.h
index 3b339e61a..2050af1e9 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h
+++ b/native/jni/src/dictionary/utils/probability_utils.h
@@ -17,6 +17,9 @@
#ifndef LATINIME_PROBABILITY_UTILS_H
#define LATINIME_PROBABILITY_UTILS_H
+#include <algorithm>
+#include <cmath>
+
#include "defines.h"
namespace latinime {
@@ -47,8 +50,20 @@ class ProbabilityUtils {
+ static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize);
}
+ // Encodes a raw probability in the same way as is done for main dictionaries.
+ static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) {
+ const float probability = static_cast<float>(MAX_PROBABILITY)
+ + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER;
+ if (probability < 0.0f) {  // Floor the encoded value at 0.
+ return 0;
+ }  // NOTE(review): a negative or NaN input yields NaN, which this check misses - confirm.
+ return std::min(static_cast<int>(probability + 0.5f), MAX_PROBABILITY);  // Round, then cap.
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils);
+
+ static const float PROBABILITY_ENCODING_SCALER;
};
}
#endif /* LATINIME_PROBABILITY_UTILS_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp b/native/jni/src/dictionary/utils/sparse_table.cpp
index d336306b9..029329fab 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp
+++ b/native/jni/src/dictionary/utils/sparse_table.cpp
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/sparse_table.h"
+#include "dictionary/utils/sparse_table.h"
namespace latinime {
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h b/native/jni/src/dictionary/utils/sparse_table.h
index fca8120f1..bd1190e8b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h
+++ b/native/jni/src/dictionary/utils/sparse_table.h
@@ -20,11 +20,10 @@
#include <cstdint>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
namespace latinime {
-// Note that there is a corresponding implementation in SparseTable.java.
// TODO: Support multiple content buffers.
class SparseTable {
public:
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp b/native/jni/src/dictionary/utils/trie_map.cpp
index 407b8efd0..0bef8c702 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp
+++ b/native/jni/src/dictionary/utils/trie_map.cpp
@@ -14,9 +14,9 @@
* limitations under the License.
*/
-#include "suggest/policyimpl/dictionary/utils/trie_map.h"
+#include "dictionary/utils/trie_map.h"
-#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h"
+#include "dictionary/utils/dict_file_writing_utils.h"
namespace latinime {
@@ -26,6 +26,7 @@ const int TrieMap::FIELD1_SIZE = 3;
const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE;
const uint32_t TrieMap::VALUE_FLAG = 0x400000;
const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF;
+const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK;
const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000;
const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF;
const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5;
@@ -34,6 +35,7 @@ const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_O
const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0;
const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE;
const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0);
+const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry.
const uint64_t TrieMap::MAX_VALUE =
(static_cast<uint64_t>(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1;
const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE;
@@ -76,14 +78,17 @@ int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIn
return terminalEntry.getValueEntryIndex() + 1;
}
// Create a value entry and a bitmap entry.
- const int valueEntryIndex = allocateTable(2 /* entryCount */);
+ const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT);
+ if (valueEntryIndex == INVALID_INDEX) {
+ return INVALID_INDEX;
+ }
if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) {
return INVALID_INDEX;
}
if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) {
return INVALID_INDEX;
}
- if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, valueEntryIndex)) {
+ if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) {
return INVALID_INDEX;
}
return valueEntryIndex + 1;
@@ -108,6 +113,31 @@ bool TrieMap::save(FILE *const file) const {
return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer);
}
+bool TrieMap::remove(const int key, const int bitmapEntryIndex) { // Removes key from the level whose bitmap entry is at bitmapEntryIndex; returns false when the key is not found or any write/free fails.
+ const Entry bitmapEntry = readEntry(bitmapEntryIndex);
+ const uint32_t unsignedKey = static_cast<uint32_t>(key);
+ const int terminalEntryIndex = getTerminalEntryIndex(
+ unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */);
+ if (terminalEntryIndex == INVALID_INDEX) {
+ // Not found.
+ return false;
+ }
+ const Entry terminalEntry = readEntry(terminalEntryIndex); // Read before the overwrite below so the terminal link can still be followed afterwards.
+ if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) { // Overwrite field1 with the invalid-value sentinel.
+ return false;
+ }
+ if (terminalEntry.hasTerminalLink()) {
+ const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1); // The next-level bitmap entry is stored right after the value entry; read it before the table is freed.
+ if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) {
+ return false;
+ }
+ if (!removeInner(nextLevelBitmapEntry)){ // Also clean up the entries stored below the removed key.
+ return false;
+ }
+ }
+ return true;
+}
+
/**
* Iterate next entry in a certain level.
*
@@ -129,7 +159,7 @@ const TrieMap::Result TrieMap::iterateNext(std::vector<TableIterationState> *con
if (entry.isBitmapEntry()) {
// Move to child.
iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex());
- } else {
+ } else if (entry.isValidTerminalEntry()) {
if (outKey) {
*outKey = entry.getKey();
}
@@ -162,12 +192,15 @@ uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const {
}
bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) {
- if (value <= VALUE_MASK) {
+ if (value < VALUE_MASK) {
// Write value into the terminal entry.
return writeField1(value | VALUE_FLAG, terminalEntryIndex);
}
// Create value entry and write value.
- const int valueEntryIndex = allocateTable(2 /* entryCount */);
+ const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT);
+ if (valueEntryIndex == INVALID_INDEX) {
+ return false;
+ }
if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) {
return false;
}
@@ -227,6 +260,9 @@ int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey,
// Move to the next level.
return getTerminalEntryIndex(key, hashedKey, entry, level + 1);
}
+ if (!entry.isValidTerminalEntry()) {
+ return INVALID_INDEX;
+ }
if (entry.getKey() == key) {
// Terminal entry is found.
return entryIndex;
@@ -287,6 +323,10 @@ bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32
// Bitmap entry is found. Go to the next level.
return putInternal(key, value, hashedKey, entryIndex, entry, level + 1);
}
+ if (!entry.isValidTerminalEntry()) {
+ // Overwrite invalid terminal entry.
+ return writeTerminalEntry(key, value, entryIndex);
+ }
if (entry.getKey() == key) {
// Terminal entry for the key is found. Update the value.
return updateValue(entry, value, entryIndex);
@@ -384,4 +424,37 @@ bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t val
return true;
}
+bool TrieMap::removeInner(const Entry &bitmapEntry) { // Walks the table referenced by bitmapEntry, invalidating terminal entries and freeing their linked tables recursively; returns false as soon as a write or free fails.
+ const int tableSize = popCount(bitmapEntry.getBitmap());
+ if (tableSize <= 0) {
+ // The table is empty. No need to remove any entries.
+ return true;
+ }
+ for (int i = 0; i < tableSize; ++i) {
+ const int entryIndex = bitmapEntry.getTableIndex() + i;
+ const Entry entry = readEntry(entryIndex);
+ if (entry.isBitmapEntry()) {
+ // Delete next bitmap entry recursively.
+ if (!removeInner(entry)) { // NOTE(review): the nested table itself is not passed to freeTable in this function - confirm that is intended.
+ return false;
+ }
+ } else {
+ // Invalidate terminal entry just in case.
+ if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) {
+ return false;
+ }
+ if (entry.hasTerminalLink()) { // 'entry' was read before the overwrite above, so its link is still available.
+ const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1); // Read the next-level bitmap entry before its table is freed.
+ if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) {
+ return false;
+ }
+ if (!removeInner(nextLevelBitmapEntry)) {
+ return false;
+ }
+ }
+ }
+ }
+ return true;
+}
+
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h b/native/jni/src/dictionary/utils/trie_map.h
index 3e5c4010c..5fc6c2690 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h
+++ b/native/jni/src/dictionary/utils/trie_map.h
@@ -23,7 +23,7 @@
#include <vector>
#include "defines.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
+#include "dictionary/utils/buffer_with_extendable_buffer.h"
#include "utils/byte_array_view.h"
namespace latinime {
@@ -84,6 +84,10 @@ class TrieMap {
return mValue;
}
+ AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const {
+ return mNextLevelBitmapEntryIndex;
+ }
+
private:
const TrieMap *const mTrieMap;
const int mKey;
@@ -94,7 +98,7 @@ class TrieMap {
TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex)
: mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex),
mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) {
- if (!trieMap) {
+ if (!trieMap || mBaseBitmapEntryIndex == INVALID_INDEX) {
return;
}
const Entry bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex);
@@ -202,6 +206,8 @@ class TrieMap {
bool save(FILE *const file) const;
+ bool remove(const int key, const int bitmapEntryIndex);
+
private:
DISALLOW_COPY_AND_ASSIGN(TrieMap);
@@ -245,6 +251,11 @@ class TrieMap {
}
// For terminal entry.
+ AK_FORCE_INLINE bool isValidTerminalEntry() const { // Invalid only when there is no terminal link and the value bits equal INVALID_VALUE_IN_KEY_VALUE_ENTRY (presumably the pattern TrieMap::remove() writes - confirm).
+ return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY);
+ }
+
+ // For terminal entry.
AK_FORCE_INLINE uint32_t getValueEntryIndex() const {
return mData1 & TERMINAL_LINK_MASK;
}
@@ -272,6 +283,7 @@ class TrieMap {
static const int ENTRY_SIZE;
static const uint32_t VALUE_FLAG;
static const uint32_t VALUE_MASK;
+ static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY;
static const uint32_t TERMINAL_LINK_FLAG;
static const uint32_t TERMINAL_LINK_MASK;
static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL;
@@ -280,6 +292,7 @@ class TrieMap {
static const int ROOT_BITMAP_ENTRY_INDEX;
static const int ROOT_BITMAP_ENTRY_POS;
static const Entry EMPTY_BITMAP_ENTRY;
+ static const int TERMINAL_LINKED_ENTRY_COUNT;
static const int MAX_BUFFER_SIZE;
uint32_t getBitShuffledKey(const uint32_t key) const;
@@ -378,6 +391,8 @@ class TrieMap {
AK_FORCE_INLINE int getTailEntryIndex() const {
return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE;
}
+
+ bool removeInner(const Entry &bitmapEntry);
};
} // namespace latinime
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h
index d1b2c87be..5214077dc 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node.h
@@ -26,6 +26,7 @@
#include "suggest/core/dictionary/error_type_utils.h"
#include "suggest/core/layout/proximity_info_state.h"
#include "utils/char_utils.h"
+#include "utils/int_array_view.h"
#if DEBUG_DICT
#define LOGI_SHOW_ADD_COST_PROP \
@@ -103,10 +104,10 @@ class DicNode {
PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
}
- // Init for root with prevWordsPtNodePos which is used for n-gram
- void initAsRoot(const int rootPtNodeArrayPos, const int *const prevWordsPtNodePos) {
+ // Init for root with prevWordIds which is used for n-gram
+ void initAsRoot(const int rootPtNodeArrayPos, const WordIdArrayView prevWordIds) {
mIsCachedForNextSuggestion = false;
- mDicNodeProperties.init(rootPtNodeArrayPos, prevWordsPtNodePos);
+ mDicNodeProperties.init(rootPtNodeArrayPos, prevWordIds);
mDicNodeState.init();
PROF_NODE_RESET(mProfiler);
}
@@ -114,12 +115,11 @@ class DicNode {
// Init for root with previous word
void initAsRootWithPreviousWord(const DicNode *const dicNode, const int rootPtNodeArrayPos) {
mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
- int newPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- newPrevWordsPtNodePos[0] = dicNode->mDicNodeProperties.getPtNodePos();
- for (size_t i = 1; i < NELEMS(newPrevWordsPtNodePos); ++i) {
- newPrevWordsPtNodePos[i] = dicNode->getPrevWordsTerminalPtNodePos()[i - 1];
- }
- mDicNodeProperties.init(rootPtNodeArrayPos, newPrevWordsPtNodePos);
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> newPrevWordIds;
+ newPrevWordIds[0] = dicNode->mDicNodeProperties.getWordId();
+ dicNode->getPrevWordIds().limit(newPrevWordIds.size() - 1)
+ .copyToArray(&newPrevWordIds, 1 /* offset */);
+ mDicNodeProperties.init(rootPtNodeArrayPos, WordIdArrayView::fromArray(newPrevWordIds));
mDicNodeState.initAsRootWithPreviousWord(&dicNode->mDicNodeState,
dicNode->mDicNodeProperties.getDepth());
PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
@@ -135,19 +135,16 @@ class DicNode {
PROF_NODE_COPY(&parentDicNode->mProfiler, mProfiler);
}
- void initAsChild(const DicNode *const dicNode, const int ptNodePos,
- const int childrenPtNodeArrayPos, const int probability, const bool isTerminal,
- const bool hasChildren, const bool isBlacklistedOrNotAWord,
- const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
+ void initAsChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos,
+ const int wordId, const CodePointArrayView mergedCodePoints) {
uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1);
mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion;
const uint16_t newLeavingDepth = static_cast<uint16_t>(
- dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount);
- mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0],
- probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth,
- newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos());
- mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount,
- mergedNodeCodePoints);
+ dicNode->mDicNodeProperties.getLeavingDepth() + mergedCodePoints.size());
+ mDicNodeProperties.init(childrenPtNodeArrayPos, mergedCodePoints[0],
+ wordId, newDepth, newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordIds());
+ mDicNodeState.init(&dicNode->mDicNodeState, mergedCodePoints.size(),
+ mergedCodePoints.data());
PROF_NODE_COPY(&dicNode->mProfiler, mProfiler);
}
@@ -179,9 +176,6 @@ class DicNode {
// Check if the current word and the previous word can be considered as a valid multiple word
// suggestion.
bool isValidMultipleWordSuggestion() const {
- if (isBlacklistedOrNotAWord()) {
- return false;
- }
// Treat suggestion as invalid if the current and the previous word are single character
// words.
const int prevWordLen = mDicNodeState.mDicNodeStateOutput.getPrevWordsLength()
@@ -204,13 +198,12 @@ class DicNode {
}
// Used to get n-gram probability in DicNodeUtils.
- int getPtNodePos() const {
- return mDicNodeProperties.getPtNodePos();
+ int getWordId() const {
+ return mDicNodeProperties.getWordId();
}
- // TODO: Use view class to return PtNodePos array.
- const int *getPrevWordsTerminalPtNodePos() const {
- return mDicNodeProperties.getPrevWordsTerminalPtNodePos();
+ const WordIdArrayView getPrevWordIds() const {
+ return mDicNodeProperties.getPrevWordIds();
}
// Used in DicNodeUtils
@@ -218,10 +211,6 @@ class DicNode {
return mDicNodeProperties.getChildrenPtNodeArrayPos();
}
- int getProbability() const {
- return mDicNodeProperties.getProbability();
- }
-
AK_FORCE_INLINE bool isTerminalDicNode() const {
const bool isTerminalPtNode = mDicNodeProperties.isTerminal();
const int currentDicNodeDepth = getNodeCodePointCount();
@@ -306,8 +295,9 @@ class DicNode {
}
// Used to prune nodes
- float getCompoundDistance(const float languageWeight) const {
- return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(languageWeight);
+ float getCompoundDistance(const float weightOfLangModelVsSpatialModel) const {
+ return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(
+ weightOfLangModelVsSpatialModel);
}
AK_FORCE_INLINE const int *getOutputWordBuf() const {
@@ -404,10 +394,6 @@ class DicNode {
return mDicNodeState.mDicNodeStateScoring.getContainedErrorTypes();
}
- bool isBlacklistedOrNotAWord() const {
- return mDicNodeProperties.isBlacklistedOrNotAWord();
- }
-
inline uint16_t getNodeCodePointCount() const {
return mDicNodeProperties.getDepth();
}
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp
index 69ea67418..a20252cd2 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp
+++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp
@@ -16,10 +16,9 @@
#include "suggest/core/dicnode/dic_node_utils.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_vector.h"
-#include "suggest/core/dictionary/multi_bigram_map.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
namespace latinime {
@@ -29,8 +28,8 @@ namespace latinime {
/* static */ void DicNodeUtils::initAsRoot(
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
- const int *const prevWordsPtNodePos, DicNode *const newRootDicNode) {
- newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordsPtNodePos);
+ const WordIdArrayView prevWordIds, DicNode *const newRootDicNode) {
+ newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordIds);
}
/*static */ void DicNodeUtils::initAsRootWithPreviousWord(
@@ -73,25 +72,17 @@ namespace latinime {
if (dicNode->hasMultipleWords() && !dicNode->isValidMultipleWordSuggestion()) {
return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
}
- const int probability = getBigramNodeProbability(dictionaryStructurePolicy, dicNode,
- multiBigramMap);
+ const WordAttributes wordAttributes = dictionaryStructurePolicy->getWordAttributesInContext(
+ dicNode->getPrevWordIds(), dicNode->getWordId(), multiBigramMap);
+ if (wordAttributes.getProbability() == NOT_A_PROBABILITY
+ || (dicNode->hasMultipleWords()
+ && (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()))) {
+ return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+ }
// TODO: This equation to calculate the improbability looks unreasonable. Investigate this.
- const float cost = static_cast<float>(MAX_PROBABILITY - probability)
+ const float cost = static_cast<float>(MAX_PROBABILITY - wordAttributes.getProbability())
/ static_cast<float>(MAX_PROBABILITY);
return cost;
}
-/* static */ int DicNodeUtils::getBigramNodeProbability(
- const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
- const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) {
- const int unigramProbability = dicNode->getProbability();
- if (multiBigramMap) {
- const int *const prevWordsPtNodePos = dicNode->getPrevWordsTerminalPtNodePos();
- return multiBigramMap->getBigramProbability(dictionaryStructurePolicy,
- prevWordsPtNodePos, dicNode->getPtNodePos(), unigramProbability);
- }
- return dictionaryStructurePolicy->getProbability(unigramProbability,
- NOT_A_PROBABILITY);
-}
-
} // namespace latinime
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h
index 00e80c604..b891a842a 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h
@@ -18,6 +18,7 @@
#define LATINIME_DIC_NODE_UTILS_H
#include "defines.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -30,7 +31,7 @@ class DicNodeUtils {
public:
static void initAsRoot(
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
- const int *const prevWordPtNodePos, DicNode *const newRootDicNode);
+ const WordIdArrayView prevWordIds, DicNode *const newRootDicNode);
static void initAsRootWithPreviousWord(
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode);
@@ -46,10 +47,6 @@ class DicNodeUtils {
DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils);
// Max number of bigrams to look up
static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500;
-
- static int getBigramNodeProbability(
- const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
- const DicNode *const dicNode, MultiBigramMap *const multiBigramMap);
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_UTILS_H
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h
index 54cde1988..e6b758954 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h
@@ -21,6 +21,7 @@
#include "defines.h"
#include "suggest/core/dicnode/dic_node.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -58,15 +59,11 @@ class DicNodeVector {
mDicNodes.back().initAsPassingChild(dicNode);
}
- void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos,
- const int childrenPtNodeArrayPos, const int probability, const bool isTerminal,
- const bool hasChildren, const bool isBlacklistedOrNotAWord,
- const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) {
+ void pushLeavingChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos,
+ const int wordId, const CodePointArrayView mergedCodePoints) {
ASSERT(!mLock);
mDicNodes.emplace_back();
- mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability,
- isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount,
- mergedNodeCodePoints);
+ mDicNodes.back().initAsChild(dicNode, childrenPtNodeArrayPos, wordId, mergedCodePoints);
}
DicNode *operator[](const int id) {
diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h
index 8202176f7..1b796b5d4 100644
--- a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h
+++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h
@@ -18,8 +18,10 @@
#define LATINIME_DIC_NODE_PROPERTIES_H
#include <cstdint>
+#include <cstdlib>
#include "defines.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -29,84 +31,61 @@ namespace latinime {
class DicNodeProperties {
public:
AK_FORCE_INLINE DicNodeProperties()
- : mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS),
- mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT),
- mIsTerminal(false), mHasChildrenPtNodes(false),
- mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {}
+ : mChildrenPtNodeArrayPos(NOT_A_DICT_POS), mDicNodeCodePoint(NOT_A_CODE_POINT),
+ mWordId(NOT_A_WORD_ID), mDepth(0), mLeavingDepth(0), mPrevWordCount(0) {}
~DicNodeProperties() {}
// Should be called only once per DicNode is initialized.
- void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability,
- const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord,
- const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) {
- mPtNodePos = pos;
+ void init(const int childrenPos, const int nodeCodePoint, const int wordId,
+ const uint16_t depth, const uint16_t leavingDepth, const WordIdArrayView prevWordIds) {
mChildrenPtNodeArrayPos = childrenPos;
mDicNodeCodePoint = nodeCodePoint;
- mProbability = probability;
- mIsTerminal = isTerminal;
- mHasChildrenPtNodes = hasChildren;
- mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord;
+ mWordId = wordId;
mDepth = depth;
mLeavingDepth = leavingDepth;
- memmove(mPrevWordsTerminalPtNodePos, prevWordsNodePos, sizeof(mPrevWordsTerminalPtNodePos));
+ prevWordIds.copyToArray(&mPrevWordIds, 0 /* offset */);
+ mPrevWordCount = prevWordIds.size();
}
- // Init for root with prevWordsPtNodePos which is used for n-gram
+ // Init for root with prevWordIds which is used for n-gram
- void init(const int rootPtNodeArrayPos, const int *const prevWordsNodePos) {
- mPtNodePos = NOT_A_DICT_POS;
+ void init(const int rootPtNodeArrayPos, const WordIdArrayView prevWordIds) {
mChildrenPtNodeArrayPos = rootPtNodeArrayPos;
mDicNodeCodePoint = NOT_A_CODE_POINT;
- mProbability = NOT_A_PROBABILITY;
- mIsTerminal = false;
- mHasChildrenPtNodes = true;
- mIsBlacklistedOrNotAWord = false;
+ mWordId = NOT_A_WORD_ID;
mDepth = 0;
mLeavingDepth = 0;
- memmove(mPrevWordsTerminalPtNodePos, prevWordsNodePos, sizeof(mPrevWordsTerminalPtNodePos));
+ prevWordIds.copyToArray(&mPrevWordIds, 0 /* offset */);
+ mPrevWordCount = prevWordIds.size();
}
void initByCopy(const DicNodeProperties *const dicNodeProp) {
- mPtNodePos = dicNodeProp->mPtNodePos;
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint;
- mProbability = dicNodeProp->mProbability;
- mIsTerminal = dicNodeProp->mIsTerminal;
- mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
- mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
+ mWordId = dicNodeProp->mWordId;
mDepth = dicNodeProp->mDepth;
mLeavingDepth = dicNodeProp->mLeavingDepth;
- memmove(mPrevWordsTerminalPtNodePos, dicNodeProp->mPrevWordsTerminalPtNodePos,
- sizeof(mPrevWordsTerminalPtNodePos));
+ const WordIdArrayView prevWordIdArrayView = dicNodeProp->getPrevWordIds();
+ prevWordIdArrayView.copyToArray(&mPrevWordIds, 0 /* offset */);
+ mPrevWordCount = prevWordIdArrayView.size();
}
// Init as passing child
void init(const DicNodeProperties *const dicNodeProp, const int codePoint) {
- mPtNodePos = dicNodeProp->mPtNodePos;
mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos;
mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child
- mProbability = dicNodeProp->mProbability;
- mIsTerminal = dicNodeProp->mIsTerminal;
- mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes;
- mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord;
+ mWordId = dicNodeProp->mWordId;
mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child
mLeavingDepth = dicNodeProp->mLeavingDepth;
- memmove(mPrevWordsTerminalPtNodePos, dicNodeProp->mPrevWordsTerminalPtNodePos,
- sizeof(mPrevWordsTerminalPtNodePos));
- }
-
- int getPtNodePos() const {
- return mPtNodePos;
+ const WordIdArrayView prevWordIdArrayView = dicNodeProp->getPrevWordIds();
+ prevWordIdArrayView.copyToArray(&mPrevWordIds, 0 /* offset */);
+ mPrevWordCount = prevWordIdArrayView.size();
}
int getChildrenPtNodeArrayPos() const {
return mChildrenPtNodeArrayPos;
}
- int getProbability() const {
- return mProbability;
- }
-
int getDicNodeCodePoint() const {
return mDicNodeCodePoint;
}
@@ -121,35 +100,32 @@ class DicNodeProperties {
}
bool isTerminal() const {
- return mIsTerminal;
+ return mWordId != NOT_A_WORD_ID;
}
bool hasChildren() const {
- return mHasChildrenPtNodes || mDepth != mLeavingDepth;
+ return (mChildrenPtNodeArrayPos != NOT_A_DICT_POS) || mDepth != mLeavingDepth;
}
- bool isBlacklistedOrNotAWord() const {
- return mIsBlacklistedOrNotAWord;
+ const WordIdArrayView getPrevWordIds() const {
+ return WordIdArrayView::fromArray(mPrevWordIds).limit(mPrevWordCount);
}
- const int *getPrevWordsTerminalPtNodePos() const {
- return mPrevWordsTerminalPtNodePos;
+ int getWordId() const {
+ return mWordId;
}
private:
// Caution!!!
// Use a default copy constructor and an assign operator because shallow copies are ok
// for this class
- int mPtNodePos;
int mChildrenPtNodeArrayPos;
- int mProbability;
int mDicNodeCodePoint;
- bool mIsTerminal;
- bool mHasChildrenPtNodes;
- bool mIsBlacklistedOrNotAWord;
+ int mWordId;
uint16_t mDepth;
uint16_t mLeavingDepth;
- int mPrevWordsTerminalPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> mPrevWordIds;
+ size_t mPrevWordCount;
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_PROPERTIES_H
diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h
index c19d48eb9..3a54c2599 100644
--- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h
+++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h
@@ -103,8 +103,10 @@ class DicNodeStateScoring {
return getCompoundDistance(1.0f);
}
- float getCompoundDistance(const float languageWeight) const {
- return mSpatialDistance + mLanguageDistance * languageWeight;
+ float getCompoundDistance(
+ const float weightOfLangModelVsSpatialModel) const {
+ return mSpatialDistance
+ + mLanguageDistance * weightOfLangModelVsSpatialModel;
}
float getNormalizedCompoundDistance() const {
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp
index d62573970..5c9a1392e 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp
@@ -19,15 +19,16 @@
#include "suggest/core/dictionary/dictionary.h"
#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/property/ngram_context.h"
#include "suggest/core/dictionary/dictionary_utils.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
#include "suggest/core/result/suggestion_results.h"
#include "suggest/core/session/dic_traverse_session.h"
-#include "suggest/core/session/prev_words_info.h"
#include "suggest/core/suggest.h"
#include "suggest/core/suggest_options.h"
#include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h"
#include "suggest/policyimpl/typing/typing_suggest_policy_factory.h"
+#include "utils/int_array_view.h"
#include "utils/log_utils.h"
#include "utils/time_keeper.h"
@@ -45,88 +46,87 @@ Dictionary::Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::Structu
void Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession,
int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints,
- int inputSize, const PrevWordsInfo *const prevWordsInfo,
- const SuggestOptions *const suggestOptions, const float languageWeight,
+ int inputSize, const NgramContext *const ngramContext,
+ const SuggestOptions *const suggestOptions, const float weightOfLangModelVsSpatialModel,
SuggestionResults *const outSuggestionResults) const {
TimeKeeper::setCurrentTime();
- traverseSession->init(this, prevWordsInfo, suggestOptions);
+ traverseSession->init(this, ngramContext, suggestOptions);
const auto &suggest = suggestOptions->isGesture() ? mGestureSuggest : mTypingSuggest;
suggest->getSuggestions(proximityInfo, traverseSession, xcoordinates,
ycoordinates, times, pointerIds, inputCodePoints, inputSize,
- languageWeight, outSuggestionResults);
- if (DEBUG_DICT) {
- outSuggestionResults->dumpSuggestions();
- }
+ weightOfLangModelVsSpatialModel, outSuggestionResults);
}
Dictionary::NgramListenerForPrediction::NgramListenerForPrediction(
- const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const suggestionResults,
+ const NgramContext *const ngramContext, const WordIdArrayView prevWordIds,
+ SuggestionResults *const suggestionResults,
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy)
- : mPrevWordsInfo(prevWordsInfo), mSuggestionResults(suggestionResults),
- mDictStructurePolicy(dictStructurePolicy) {}
+ : mNgramContext(ngramContext), mPrevWordIds(prevWordIds),
+ mSuggestionResults(suggestionResults), mDictStructurePolicy(dictStructurePolicy) {}
void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability,
- const int targetPtNodePos) {
- if (targetPtNodePos == NOT_A_DICT_POS) {
+ const int targetWordId) {
+ if (targetWordId == NOT_A_WORD_ID) {
return;
}
- if (mPrevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)
+ if (mNgramContext->isNthPrevWordBeginningOfSentence(1 /* n */)
&& ngramProbability == NOT_A_PROBABILITY) {
return;
}
int targetWordCodePoints[MAX_WORD_LENGTH];
- int unigramProbability = 0;
- const int codePointCount = mDictStructurePolicy->
- getCodePointsAndProbabilityAndReturnCodePointCount(targetPtNodePos,
- MAX_WORD_LENGTH, targetWordCodePoints, &unigramProbability);
+ const int codePointCount = mDictStructurePolicy->getCodePointsAndReturnCodePointCount(
+ targetWordId, MAX_WORD_LENGTH, targetWordCodePoints);
if (codePointCount <= 0) {
return;
}
- const int probability = mDictStructurePolicy->getProbability(
- unigramProbability, ngramProbability);
- mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, probability);
+ const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext(
+ mPrevWordIds, targetWordId, nullptr /* multiBigramMap */);
+ if (wordAttributes.getProbability() == NOT_A_PROBABILITY) {
+ return;
+ }
+ mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount,
+ wordAttributes.getProbability());
}
-void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo,
+void Dictionary::getPredictions(const NgramContext *const ngramContext,
SuggestionResults *const outSuggestionResults) const {
TimeKeeper::setCurrentTime();
- NgramListenerForPrediction listener(prevWordsInfo, outSuggestionResults,
- mDictionaryStructureWithBufferPolicy.get());
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- prevWordsInfo->getPrevWordsTerminalPtNodePos(
- mDictionaryStructureWithBufferPolicy.get(), prevWordsPtNodePos,
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(
+ mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray,
true /* tryLowerCaseSearch */);
- mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordsPtNodePos, &listener);
+ NgramListenerForPrediction listener(ngramContext, prevWordIds, outSuggestionResults,
+ mDictionaryStructureWithBufferPolicy.get());
+ mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordIds, &listener);
}
-int Dictionary::getProbability(const int *word, int length) const {
- return getNgramProbability(nullptr /* prevWordsInfo */, word, length);
+int Dictionary::getProbability(const CodePointArrayView codePoints) const {
+ return getNgramProbability(nullptr /* ngramContext */, codePoints);
}
-int Dictionary::getMaxProbabilityOfExactMatches(const int *word, int length) const {
+int Dictionary::getMaxProbabilityOfExactMatches(const CodePointArrayView codePoints) const {
TimeKeeper::setCurrentTime();
return DictionaryUtils::getMaxProbabilityOfExactMatches(
- mDictionaryStructureWithBufferPolicy.get(), word, length);
+ mDictionaryStructureWithBufferPolicy.get(), codePoints);
}
-int Dictionary::getNgramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word,
- int length) const {
+int Dictionary::getNgramProbability(const NgramContext *const ngramContext,
+ const CodePointArrayView codePoints) const {
TimeKeeper::setCurrentTime();
- int nextWordPos = mDictionaryStructureWithBufferPolicy->getTerminalPtNodePositionOfWord(word,
- length, false /* forceLowerCaseSearch */);
- if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY;
- if (!prevWordsInfo) {
- return getDictionaryStructurePolicy()->getProbabilityOfPtNode(
- nullptr /* prevWordsPtNodePos */, nextWordPos);
+ const int wordId = mDictionaryStructureWithBufferPolicy->getWordId(codePoints,
+ false /* forceLowerCaseSearch */);
+ if (wordId == NOT_A_WORD_ID) return NOT_A_PROBABILITY;
+ if (!ngramContext) {
+ return getDictionaryStructurePolicy()->getProbabilityOfWord(WordIdArrayView(), wordId);
}
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- prevWordsInfo->getPrevWordsTerminalPtNodePos(
- mDictionaryStructureWithBufferPolicy.get(), prevWordsPtNodePos,
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(
+ mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray,
true /* tryLowerCaseSearch */);
- return getDictionaryStructurePolicy()->getProbabilityOfPtNode(prevWordsPtNodePos, nextWordPos);
+ return getDictionaryStructurePolicy()->getProbabilityOfWord(prevWordIds, wordId);
}
-bool Dictionary::addUnigramEntry(const int *const word, const int length,
+bool Dictionary::addUnigramEntry(const CodePointArrayView codePoints,
const UnigramProperty *const unigramProperty) {
if (unigramProperty->representsBeginningOfSentence()
&& !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy()
@@ -135,24 +135,31 @@ bool Dictionary::addUnigramEntry(const int *const word, const int length,
return false;
}
TimeKeeper::setCurrentTime();
- return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty);
+ return mDictionaryStructureWithBufferPolicy->addUnigramEntry(codePoints, unigramProperty);
+}
+
+bool Dictionary::removeUnigramEntry(const CodePointArrayView codePoints) {
+ TimeKeeper::setCurrentTime();
+ return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints);
}
-bool Dictionary::removeUnigramEntry(const int *const codePoints, const int codePointCount) {
+bool Dictionary::addNgramEntry(const NgramProperty *const ngramProperty) {
TimeKeeper::setCurrentTime();
- return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints, codePointCount);
+ return mDictionaryStructureWithBufferPolicy->addNgramEntry(ngramProperty);
}
-bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty) {
+bool Dictionary::removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView codePoints) {
TimeKeeper::setCurrentTime();
- return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty);
+ return mDictionaryStructureWithBufferPolicy->removeNgramEntry(ngramContext, codePoints);
}
-bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const int *const word, const int length) {
+bool Dictionary::updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView codePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo) {
TimeKeeper::setCurrentTime();
- return mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length);
+ return mDictionaryStructureWithBufferPolicy->updateEntriesForWordWithNgramContext(ngramContext,
+ codePoints, isValidWord, historicalInfo);
}
bool Dictionary::flush(const char *const filePath) {
@@ -177,11 +184,9 @@ void Dictionary::getProperty(const char *const query, const int queryLength, cha
maxResultLength);
}
-const WordProperty Dictionary::getWordProperty(const int *const codePoints,
- const int codePointCount) {
+const WordProperty Dictionary::getWordProperty(const CodePointArrayView codePoints) {
TimeKeeper::setCurrentTime();
- return mDictionaryStructureWithBufferPolicy->getWordProperty(
- codePoints, codePointCount);
+ return mDictionaryStructureWithBufferPolicy->getWordProperty(codePoints);
}
int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints,
diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h
index 732d3b199..9e224ebfb 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary.h
@@ -21,17 +21,19 @@
#include "defines.h"
#include "jni.h"
-#include "suggest/core/dictionary/ngram_listener.h"
-#include "suggest/core/dictionary/property/word_property.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/interface/ngram_listener.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/word_property.h"
#include "suggest/core/suggest_interface.h"
+#include "utils/int_array_view.h"
namespace latinime {
class DictionaryStructureWithBufferPolicy;
class DicTraverseSession;
-class PrevWordsInfo;
+class NgramContext;
class ProximityInfo;
class SuggestionResults;
class SuggestOptions;
@@ -58,36 +60,40 @@ class Dictionary {
static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000;
static const int KIND_FLAG_EXACT_MATCH = 0x40000000;
static const int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000;
+ static const int KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION = 0x10000000;
Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr
dictionaryStructureWithBufferPolicy);
void getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession,
int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints,
- int inputSize, const PrevWordsInfo *const prevWordsInfo,
- const SuggestOptions *const suggestOptions, const float languageWeight,
+ int inputSize, const NgramContext *const ngramContext,
+ const SuggestOptions *const suggestOptions, const float weightOfLangModelVsSpatialModel,
SuggestionResults *const outSuggestionResults) const;
- void getPredictions(const PrevWordsInfo *const prevWordsInfo,
+ void getPredictions(const NgramContext *const ngramContext,
SuggestionResults *const outSuggestionResults) const;
- int getProbability(const int *word, int length) const;
+ int getProbability(const CodePointArrayView codePoints) const;
- int getMaxProbabilityOfExactMatches(const int *word, int length) const;
+ int getMaxProbabilityOfExactMatches(const CodePointArrayView codePoints) const;
- int getNgramProbability(const PrevWordsInfo *const prevWordsInfo,
- const int *word, int length) const;
+ int getNgramProbability(const NgramContext *const ngramContext,
+ const CodePointArrayView codePoints) const;
- bool addUnigramEntry(const int *const codePoints, const int codePointCount,
+ bool addUnigramEntry(const CodePointArrayView codePoints,
const UnigramProperty *const unigramProperty);
- bool removeUnigramEntry(const int *const codePoints, const int codePointCount);
+ bool removeUnigramEntry(const CodePointArrayView codePoints);
- bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty);
+ bool addNgramEntry(const NgramProperty *const ngramProperty);
- bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word,
- const int length);
+ bool removeNgramEntry(const NgramContext *const ngramContext,
+ const CodePointArrayView codePoints);
+
+ bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext,
+ const CodePointArrayView codePoints, const bool isValidWord,
+ const HistoricalInfo historicalInfo);
bool flush(const char *const filePath);
@@ -98,7 +104,7 @@ class Dictionary {
void getProperty(const char *const query, const int queryLength, char *const outResult,
const int maxResultLength);
- const WordProperty getWordProperty(const int *const codePoints, const int codePointCount);
+ const WordProperty getWordProperty(const CodePointArrayView codePoints);
// Method to iterate all words in the dictionary.
// The returned token has to be used to get the next word. If token is 0, this method newly
@@ -117,15 +123,16 @@ class Dictionary {
class NgramListenerForPrediction : public NgramListener {
public:
- NgramListenerForPrediction(const PrevWordsInfo *const prevWordsInfo,
- SuggestionResults *const suggestionResults,
+ NgramListenerForPrediction(const NgramContext *const ngramContext,
+ const WordIdArrayView prevWordIds, SuggestionResults *const suggestionResults,
const DictionaryStructureWithBufferPolicy *const dictStructurePolicy);
- virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos);
+ virtual void onVisitEntry(const int ngramProbability, const int targetWordId);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction);
- const PrevWordsInfo *const mPrevWordsInfo;
+ const NgramContext *const mNgramContext;
+ const WordIdArrayView mPrevWordIds;
SuggestionResults *const mSuggestionResults;
const DictionaryStructureWithBufferPolicy *const mDictStructurePolicy;
};
diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp
index b94966cbe..7de550026 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp
+++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp
@@ -16,39 +16,40 @@
#include "suggest/core/dictionary/dictionary_utils.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/property/ngram_context.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_priority_queue.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/dictionary.h"
#include "suggest/core/dictionary/digraph_utils.h"
-#include "suggest/core/session/prev_words_info.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
+#include "utils/int_array_view.h"
namespace latinime {
/* static */ int DictionaryUtils::getMaxProbabilityOfExactMatches(
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
- const int *const codePoints, const int codePointCount) {
+ const CodePointArrayView codePoints) {
std::vector<DicNode> current;
std::vector<DicNode> next;
- // No prev words information.
- PrevWordsInfo emptyPrevWordsInfo;
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- emptyPrevWordsInfo.getPrevWordsTerminalPtNodePos(dictionaryStructurePolicy,
- prevWordsPtNodePos, false /* tryLowerCaseSearch */);
+ // No ngram context.
+ NgramContext emptyNgramContext;
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray;
+ const WordIdArrayView prevWordIds = emptyNgramContext.getPrevWordIds(
+ dictionaryStructurePolicy, &prevWordIdArray, false /* tryLowerCaseSearch */);
current.emplace_back();
- DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordsPtNodePos, &current.front());
- for (int i = 0; i < codePointCount; ++i) {
+ DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordIds, &current.front());
+ for (const int codePoint : codePoints) {
// The base-lower input is used to ignore case errors and accent errors.
- const int codePoint = CharUtils::toBaseLowerCase(codePoints[i]);
+ const int baseLowerCodePoint = CharUtils::toBaseLowerCase(codePoint);
for (const DicNode &dicNode : current) {
- if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == codePoint) {
+ if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == baseLowerCodePoint) {
next.emplace_back(dicNode);
next.back().advanceDigraphIndex();
continue;
}
- processChildDicNodes(dictionaryStructurePolicy, codePoint, &dicNode, &next);
+ processChildDicNodes(dictionaryStructurePolicy, baseLowerCodePoint, &dicNode, &next);
}
current.clear();
current.swap(next);
@@ -59,8 +60,11 @@ namespace latinime {
if (!dicNode.isTerminalDicNode()) {
continue;
}
+ const WordAttributes wordAttributes =
+ dictionaryStructurePolicy->getWordAttributesInContext(dicNode.getPrevWordIds(),
+ dicNode.getWordId(), nullptr /* multiBigramMap */);
// dicNode can contain case errors, accent errors, intentional omissions or digraphs.
- maxProbability = std::max(maxProbability, dicNode.getProbability());
+ maxProbability = std::max(maxProbability, wordAttributes.getProbability());
}
return maxProbability;
}
diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.h b/native/jni/src/suggest/core/dictionary/dictionary_utils.h
index 358ebf674..4dd21c9be 100644
--- a/native/jni/src/suggest/core/dictionary/dictionary_utils.h
+++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.h
@@ -20,6 +20,7 @@
#include <vector>
#include "defines.h"
+#include "utils/int_array_view.h"
namespace latinime {
@@ -30,7 +31,7 @@ class DictionaryUtils {
public:
static int getMaxProbabilityOfExactMatches(
const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy,
- const int *const codePoints, const int codePointCount);
+ const CodePointArrayView codePoints);
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryUtils);
diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp
index bb2ce5012..4d68f620f 100644
--- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp
+++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp
@@ -19,7 +19,7 @@
#include <cstdlib>
#include "defines.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
#include "utils/char_utils.h"
namespace latinime {
diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp
index b6bf7a98c..61093e174 100644
--- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp
+++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp
@@ -19,17 +19,20 @@
namespace latinime {
const ErrorTypeUtils::ErrorType ErrorTypeUtils::NOT_AN_ERROR = 0x0;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_CASE_ERROR = 0x1;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR = 0x2;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x4;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x8;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x10;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x20;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x40;
-const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_CASE = 0x1;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT = 0x2;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT = 0x4;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT = 0x8;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x10;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x20;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x40;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x80;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x100;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x200;
const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH =
- NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH;
+ NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH;
+const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR;
const ErrorTypeUtils::ErrorType
ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION =
diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h
index e3e76b238..75111ba75 100644
--- a/native/jni/src/suggest/core/dictionary/error_type_utils.h
+++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h
@@ -30,8 +30,10 @@ class ErrorTypeUtils {
typedef uint32_t ErrorType;
static const ErrorType NOT_AN_ERROR;
- static const ErrorType MATCH_WITH_CASE_ERROR;
- static const ErrorType MATCH_WITH_ACCENT_ERROR;
+ static const ErrorType MATCH_WITH_WRONG_CASE;
+ static const ErrorType MATCH_WITH_MISSING_ACCENT;
+ static const ErrorType MATCH_WITH_MISSING_EXPLICIT_ACCENT;
+ static const ErrorType MATCH_WITH_WRONG_ACCENT;
static const ErrorType MATCH_WITH_DIGRAPH;
// Treat error as an intentional omission when the CorrectionType is omission and the node can
// be intentional omission.
@@ -51,11 +53,19 @@ class ErrorTypeUtils {
return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0;
}
+ static bool isPerfectMatch(const ErrorType containedErrorTypes) {
+ return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0;
+ }
+
static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) {
return (containedErrorTypes
& ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0;
}
+ static bool isMissingExplicitAccent(const ErrorType errorType) {
+ return (errorType & MATCH_WITH_MISSING_EXPLICIT_ACCENT) != 0;
+ }
+
static bool isEditCorrectionError(const ErrorType errorType) {
return (errorType & EDIT_CORRECTION) != 0;
}
@@ -72,6 +82,7 @@ class ErrorTypeUtils {
DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils);
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH;
+ static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH;
static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION;
};
} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/property/bigram_property.h b/native/jni/src/suggest/core/dictionary/property/bigram_property.h
deleted file mode 100644
index 343af143c..000000000
--- a/native/jni/src/suggest/core/dictionary/property/bigram_property.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_BIGRAM_PROPERTY_H
-#define LATINIME_BIGRAM_PROPERTY_H
-
-#include <vector>
-
-#include "defines.h"
-
-namespace latinime {
-
-// TODO: Change to NgramProperty.
-class BigramProperty {
- public:
- BigramProperty(const std::vector<int> *const targetCodePoints,
- const int probability, const int timestamp, const int level, const int count)
- : mTargetCodePoints(*targetCodePoints), mProbability(probability),
- mTimestamp(timestamp), mLevel(level), mCount(count) {}
-
- const std::vector<int> *getTargetCodePoints() const {
- return &mTargetCodePoints;
- }
-
- int getProbability() const {
- return mProbability;
- }
-
- int getTimestamp() const {
- return mTimestamp;
- }
-
- int getLevel() const {
- return mLevel;
- }
-
- int getCount() const {
- return mCount;
- }
-
- private:
- // Default copy constructor and assign operator are used for using in std::vector.
- DISALLOW_DEFAULT_CONSTRUCTOR(BigramProperty);
-
- // TODO: Make members const.
- std::vector<int> mTargetCodePoints;
- int mProbability;
- int mTimestamp;
- int mLevel;
- int mCount;
-};
-} // namespace latinime
-#endif // LATINIME_WORD_PROPERTY_H
diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h
deleted file mode 100644
index 902eb000f..000000000
--- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_UNIGRAM_PROPERTY_H
-#define LATINIME_UNIGRAM_PROPERTY_H
-
-#include <vector>
-
-#include "defines.h"
-
-namespace latinime {
-
-class UnigramProperty {
- public:
- class ShortcutProperty {
- public:
- ShortcutProperty(const std::vector<int> *const targetCodePoints, const int probability)
- : mTargetCodePoints(*targetCodePoints), mProbability(probability) {}
-
- const std::vector<int> *getTargetCodePoints() const {
- return &mTargetCodePoints;
- }
-
- int getProbability() const {
- return mProbability;
- }
-
- private:
- // Default copy constructor and assign operator are used for using in std::vector.
- DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty);
-
- // TODO: Make members const.
- std::vector<int> mTargetCodePoints;
- int mProbability;
- };
-
- UnigramProperty()
- : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false),
- mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0),
- mShortcuts() {}
-
- UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
- const bool isBlacklisted, const int probability, const int timestamp, const int level,
- const int count, const std::vector<ShortcutProperty> *const shortcuts)
- : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
- mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability),
- mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {}
-
- bool representsBeginningOfSentence() const {
- return mRepresentsBeginningOfSentence;
- }
-
- bool isNotAWord() const {
- return mIsNotAWord;
- }
-
- bool isBlacklisted() const {
- return mIsBlacklisted;
- }
-
- bool hasShortcuts() const {
- return !mShortcuts.empty();
- }
-
- int getProbability() const {
- return mProbability;
- }
-
- int getTimestamp() const {
- return mTimestamp;
- }
-
- int getLevel() const {
- return mLevel;
- }
-
- int getCount() const {
- return mCount;
- }
-
- const std::vector<ShortcutProperty> &getShortcuts() const {
- return mShortcuts;
- }
-
- private:
- // Default copy constructor is used for using as a return value.
- DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
-
- // TODO: Make members const.
- bool mRepresentsBeginningOfSentence;
- bool mIsNotAWord;
- bool mIsBlacklisted;
- int mProbability;
- // Historical information
- int mTimestamp;
- int mLevel;
- int mCount;
- std::vector<ShortcutProperty> mShortcuts;
-};
-} // namespace latinime
-#endif // LATINIME_UNIGRAM_PROPERTY_H
diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp
deleted file mode 100644
index 5bdd5606b..000000000
--- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/core/dictionary/property/word_property.h"
-
-#include "utils/jni_data_utils.h"
-
-namespace latinime {
-
-void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
- jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
- jobject outBigramProbabilities, jobject outShortcutTargets,
- jobject outShortcutProbabilities) const {
- JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
- MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
- false /* needsNullTermination */);
- jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(),
- !mBigrams.empty(), mUnigramProperty.hasShortcuts(),
- mUnigramProperty.representsBeginningOfSentence()};
- env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
- int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(),
- mUnigramProperty.getLevel(), mUnigramProperty.getCount()};
- env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo),
- probabilityInfo);
-
- jclass integerClass = env->FindClass("java/lang/Integer");
- jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V");
- jclass arrayListClass = env->FindClass("java/util/ArrayList");
- jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
-
- // Output bigrams.
- for (const auto &bigramProperty : mBigrams) {
- const std::vector<int> *const word1CodePoints = bigramProperty.getTargetCodePoints();
- jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size());
- JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */,
- word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(),
- false /* needsNullTermination */);
- env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray);
- env->DeleteLocalRef(bigramWord1CodePointArray);
-
- int bigramProbabilityInfo[] = {bigramProperty.getProbability(),
- bigramProperty.getTimestamp(), bigramProperty.getLevel(),
- bigramProperty.getCount()};
- jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
- env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
- NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
- env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray);
- env->DeleteLocalRef(bigramProbabilityInfoArray);
- }
-
- // Output shortcuts.
- for (const auto &shortcut : mUnigramProperty.getShortcuts()) {
- const std::vector<int> *const targetCodePoints = shortcut.getTargetCodePoints();
- jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size());
- env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */,
- targetCodePoints->size(), targetCodePoints->data());
- JniDataUtils::outputCodePoints(env, shortcutTargetCodePointArray, 0 /* start */,
- targetCodePoints->size(), targetCodePoints->data(), targetCodePoints->size(),
- false /* needsNullTermination */);
- env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray);
- env->DeleteLocalRef(shortcutTargetCodePointArray);
- jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId,
- shortcut.getProbability());
- env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability);
- env->DeleteLocalRef(integerProbability);
- }
- env->DeleteLocalRef(integerClass);
- env->DeleteLocalRef(arrayListClass);
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp b/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp
index 34b8b37b0..8b39f7da5 100644
--- a/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp
+++ b/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp
@@ -19,7 +19,7 @@
namespace latinime {
// TODO: Stop using hardcoded additional proximity characters.
// TODO: Have proximity character informations in each language's binary dictionary.
-const char *AdditionalProximityChars::LOCALE_EN_US = "en";
+const int AdditionalProximityChars::LOCALE_EN_US[LOCALE_EN_US_SIZE] = { 'e', 'n' };
const int AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = {
'e', 'i', 'o', 'u'
diff --git a/native/jni/src/suggest/core/layout/additional_proximity_chars.h b/native/jni/src/suggest/core/layout/additional_proximity_chars.h
index a88fd6cea..2260be9bd 100644
--- a/native/jni/src/suggest/core/layout/additional_proximity_chars.h
+++ b/native/jni/src/suggest/core/layout/additional_proximity_chars.h
@@ -18,6 +18,7 @@
#define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H
#include <cstring>
+#include <vector>
#include "defines.h"
@@ -26,7 +27,8 @@ namespace latinime {
class AdditionalProximityChars {
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(AdditionalProximityChars);
- static const char *LOCALE_EN_US;
+ static const int LOCALE_EN_US_SIZE = 2;
+ static const int LOCALE_EN_US[LOCALE_EN_US_SIZE];
static const int EN_US_ADDITIONAL_A_SIZE = 4;
static const int EN_US_ADDITIONAL_A[];
static const int EN_US_ADDITIONAL_E_SIZE = 4;
@@ -38,15 +40,22 @@ class AdditionalProximityChars {
static const int EN_US_ADDITIONAL_U_SIZE = 4;
static const int EN_US_ADDITIONAL_U[];
- AK_FORCE_INLINE static bool isEnLocale(const char *localeStr) {
- const size_t LOCALE_EN_US_SIZE = strlen(LOCALE_EN_US);
- return localeStr && strlen(localeStr) >= LOCALE_EN_US_SIZE
- && strncmp(localeStr, LOCALE_EN_US, LOCALE_EN_US_SIZE) == 0;
+ AK_FORCE_INLINE static bool isEnLocale(const std::vector<int> *locale) {
+ const int NCHARS = NELEMS(LOCALE_EN_US);
+ if (locale->size() < NCHARS) {
+ return false;
+ }
+ for (int i = 0; i < NCHARS; ++i) {
+ if ((*locale)[i] != LOCALE_EN_US[i]) {
+ return false;
+ }
+ }
+ return true;
}
public:
- static int getAdditionalCharsSize(const char *const localeStr, const int c) {
- if (!isEnLocale(localeStr)) {
+ static int getAdditionalCharsSize(const std::vector<int> *locale, const int c) {
+ if (!isEnLocale(locale)) {
return 0;
}
switch (c) {
@@ -65,8 +74,8 @@ class AdditionalProximityChars {
}
}
- static const int *getAdditionalChars(const char *const localeStr, const int c) {
- if (!isEnLocale(localeStr)) {
+ static const int *getAdditionalChars(const std::vector<int> *locale, const int c) {
+ if (!isEnLocale(locale)) {
return 0;
}
switch (c) {
diff --git a/native/jni/src/suggest/core/layout/geometry_utils.h b/native/jni/src/suggest/core/layout/geometry_utils.h
index b667df68f..000fcd4a1 100644
--- a/native/jni/src/suggest/core/layout/geometry_utils.h
+++ b/native/jni/src/suggest/core/layout/geometry_utils.h
@@ -38,13 +38,15 @@ class GeometryUtils {
}
static AK_FORCE_INLINE float getAngleDiff(const float a1, const float a2) {
- const float deltaA = fabsf(a1 - a2);
- const float diff = ROUND_FLOAT_10000(deltaA);
- if (diff > M_PI_F) {
- const float normalizedDiff = 2.0f * M_PI_F - diff;
- return ROUND_FLOAT_10000(normalizedDiff);
+ static const float M_2PI_F = M_PI * 2.0f;
+ float delta = fabsf(a1 - a2);
+ if (delta > M_2PI_F) {
+ delta -= (M_2PI_F * static_cast<int>(delta / M_2PI_F));
}
- return diff;
+ if (delta > M_PI_F) {
+ delta = M_2PI_F - delta;
+ }
+ return ROUND_FLOAT_10000(delta);
}
static AK_FORCE_INLINE int getDistanceInt(const int x1, const int y1, const int x2,
diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp
index 4c75a188e..933a5e145 100644
--- a/native/jni/src/suggest/core/layout/proximity_info.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info.cpp
@@ -49,13 +49,13 @@ static AK_FORCE_INLINE void safeGetOrFillZeroFloatArrayRegion(JNIEnv *env, jfloa
}
}
-ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr,
- const int keyboardWidth, const int keyboardHeight, const int gridWidth,
- const int gridHeight, const int mostCommonKeyWidth, const int mostCommonKeyHeight,
- const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates,
- const jintArray keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights,
- const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs,
- const jfloatArray sweetSpotCenterYs, const jfloatArray sweetSpotRadii)
+ProximityInfo::ProximityInfo(JNIEnv *env, const int keyboardWidth, const int keyboardHeight,
+ const int gridWidth, const int gridHeight, const int mostCommonKeyWidth,
+ const int mostCommonKeyHeight, const jintArray proximityChars, const int keyCount,
+ const jintArray keyXCoordinates, const jintArray keyYCoordinates,
+ const jintArray keyWidths, const jintArray keyHeights, const jintArray keyCharCodes,
+ const jfloatArray sweetSpotCenterXs, const jfloatArray sweetSpotCenterYs,
+ const jfloatArray sweetSpotRadii)
: GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight), MOST_COMMON_KEY_WIDTH(mostCommonKeyWidth),
MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth),
NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE(1.0f +
@@ -82,13 +82,6 @@ ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr,
if (DEBUG_PROXIMITY_INFO) {
AKLOGI("Create proximity info array %d", proximityCharsLength);
}
- const jsize localeCStrUtf8Length = env->GetStringUTFLength(localeJStr);
- if (localeCStrUtf8Length >= MAX_LOCALE_STRING_LENGTH) {
- AKLOGI("Locale string length too long: length=%d", localeCStrUtf8Length);
- ASSERT(false);
- }
- memset(mLocaleStr, 0, sizeof(mLocaleStr));
- env->GetStringUTFRegion(localeJStr, 0, env->GetStringLength(localeJStr), mLocaleStr);
safeGetOrFillZeroIntArrayRegion(env, proximityChars, proximityCharsLength,
mProximityCharsArray);
safeGetOrFillZeroIntArrayRegion(env, keyXCoordinates, KEY_COUNT, mKeyXCoordinates);
diff --git a/native/jni/src/suggest/core/layout/proximity_info.h b/native/jni/src/suggest/core/layout/proximity_info.h
index d4e453736..f7c907697 100644
--- a/native/jni/src/suggest/core/layout/proximity_info.h
+++ b/native/jni/src/suggest/core/layout/proximity_info.h
@@ -18,6 +18,7 @@
#define LATINIME_PROXIMITY_INFO_H
#include <unordered_map>
+#include <vector>
#include "defines.h"
#include "jni.h"
@@ -27,9 +28,9 @@ namespace latinime {
class ProximityInfo {
public:
- ProximityInfo(JNIEnv *env, const jstring localeJStr,
- const int keyboardWidth, const int keyboardHeight, const int gridWidth,
- const int gridHeight, const int mostCommonKeyWidth, const int mostCommonKeyHeight,
+ ProximityInfo(JNIEnv *env, const int keyboardWidth, const int keyboardHeight,
+ const int gridWidth, const int gridHeight,
+ const int mostCommonKeyWidth, const int mostCommonKeyHeight,
const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates,
const jintArray keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights,
const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs,
@@ -71,11 +72,11 @@ class ProximityInfo {
AK_FORCE_INLINE void initializeProximities(const int *const inputCodes,
const int *const inputXCoordinates, const int *const inputYCoordinates,
- const int inputSize, int *allInputCodes) const {
+ const int inputSize, int *allInputCodes, const std::vector<int> *locale) const {
ProximityInfoUtils::initializeProximities(inputCodes, inputXCoordinates, inputYCoordinates,
inputSize, mKeyXCoordinates, mKeyYCoordinates, mKeyWidths, mKeyHeights,
mProximityCharsArray, CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH, MOST_COMMON_KEY_WIDTH,
- KEY_COUNT, mLocaleStr, &mLowerCodePointToKeyMap, allInputCodes);
+ KEY_COUNT, locale, &mLowerCodePointToKeyMap, allInputCodes);
}
AK_FORCE_INLINE int getKeyIndexOf(const int c) const {
@@ -103,9 +104,6 @@ class ProximityInfo {
const int KEYBOARD_HEIGHT;
const float KEYBOARD_HYPOTENUSE;
const bool HAS_TOUCH_POSITION_CORRECTION_DATA;
- // Assuming locale strings such as en_US, sr-Latn etc.
- static const int MAX_LOCALE_STRING_LENGTH = 10;
- char mLocaleStr[MAX_LOCALE_STRING_LENGTH];
int *mProximityCharsArray;
int mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD];
int mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD];
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.cpp b/native/jni/src/suggest/core/layout/proximity_info_state.cpp
index 91469e26d..d43a0026a 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state.cpp
+++ b/native/jni/src/suggest/core/layout/proximity_info_state.cpp
@@ -42,7 +42,7 @@ int ProximityInfoState::getPrimaryOriginalCodePointAt(const int index) const {
void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength,
const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize,
const int *const xCoordinates, const int *const yCoordinates, const int *const times,
- const int *const pointerIds, const bool isGeometric) {
+ const int *const pointerIds, const bool isGeometric, const std::vector<int> *locale) {
ASSERT(isGeometric || (inputSize < MAX_WORD_LENGTH));
mIsContinuousSuggestionPossible = (mHasBeenUpdatedByGeometricInput != isGeometric) ?
false : ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible(
@@ -66,7 +66,7 @@ void ProximityInfoState::initInputParams(const int pointerId, const float maxPoi
if (!isGeometric && pointerId == 0) {
mProximityInfo->initializeProximities(inputCodes, xCoordinates, yCoordinates,
- inputSize, mInputProximities);
+ inputSize, mInputProximities, locale);
}
///////////////////////
diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.h b/native/jni/src/suggest/core/layout/proximity_info_state.h
index e6180fe17..a2d663544 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_state.h
+++ b/native/jni/src/suggest/core/layout/proximity_info_state.h
@@ -37,7 +37,8 @@ class ProximityInfoState {
void initInputParams(const int pointerId, const float maxPointToKeyLength,
const ProximityInfo *proximityInfo, const int *const inputCodes,
const int inputSize, const int *xCoordinates, const int *yCoordinates,
- const int *const times, const int *const pointerIds, const bool isGeometric);
+ const int *const times, const int *const pointerIds, const bool isGeometric,
+ const std::vector<int> *locale);
/////////////////////////////////////////
// Defined here //
diff --git a/native/jni/src/suggest/core/layout/proximity_info_utils.h b/native/jni/src/suggest/core/layout/proximity_info_utils.h
index 178aada2d..79d0615b8 100644
--- a/native/jni/src/suggest/core/layout/proximity_info_utils.h
+++ b/native/jni/src/suggest/core/layout/proximity_info_utils.h
@@ -19,6 +19,7 @@
#include <cmath>
#include <unordered_map>
+#include <vector>
#include "defines.h"
#include "suggest/core/layout/additional_proximity_chars.h"
@@ -51,7 +52,7 @@ class ProximityInfoUtils {
const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights,
const int *const proximityCharsArray, const int cellHeight, const int cellWidth,
const int gridWidth, const int mostCommonKeyWidth, const int keyCount,
- const char *const localeStr,
+ const std::vector<int> *locale,
const std::unordered_map<int, int> *const codeToKeyMap, int *inputProximities) {
// Initialize
// - mInputCodes
@@ -64,7 +65,7 @@ class ProximityInfoUtils {
int *proximities = &inputProximities[i * MAX_PROXIMITY_CHARS_SIZE];
calculateProximities(keyXCoordinates, keyYCoordinates, keyWidths, keyHeights,
proximityCharsArray, cellHeight, cellWidth, gridWidth, mostCommonKeyWidth,
- keyCount, x, y, primaryKey, localeStr, codeToKeyMap, proximities);
+ keyCount, x, y, primaryKey, locale, codeToKeyMap, proximities);
}
if (DEBUG_PROXIMITY_CHARS) {
@@ -143,7 +144,7 @@ class ProximityInfoUtils {
const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights,
const int *const proximityCharsArray, const int cellHeight, const int cellWidth,
const int gridWidth, const int mostCommonKeyWidth, const int keyCount,
- const int x, const int y, const int primaryKey, const char *const localeStr,
+ const int x, const int y, const int primaryKey, const std::vector<int> *locale,
const std::unordered_map<int, int> *const codeToKeyMap, int *proximities) {
const int mostCommonKeyWidthSquare = mostCommonKeyWidth * mostCommonKeyWidth;
int insertPos = 0;
@@ -177,7 +178,7 @@ class ProximityInfoUtils {
}
}
const int additionalProximitySize =
- AdditionalProximityChars::getAdditionalCharsSize(localeStr, primaryKey);
+ AdditionalProximityChars::getAdditionalCharsSize(locale, primaryKey);
if (additionalProximitySize > 0) {
proximities[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE;
if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) {
@@ -188,7 +189,7 @@ class ProximityInfoUtils {
}
const int *additionalProximityChars =
- AdditionalProximityChars::getAdditionalChars(localeStr, primaryKey);
+ AdditionalProximityChars::getAdditionalChars(locale, primaryKey);
for (int j = 0; j < additionalProximitySize; ++j) {
const int ac = additionalProximityChars[j];
int k = 0;
diff --git a/native/jni/src/suggest/core/policy/scoring.h b/native/jni/src/suggest/core/policy/scoring.h
index 9e75cace4..b9dda83ad 100644
--- a/native/jni/src/suggest/core/policy/scoring.h
+++ b/native/jni/src/suggest/core/policy/scoring.h
@@ -30,11 +30,13 @@ class Scoring {
public:
virtual int calculateFinalScore(const float compoundDistance, const int inputSize,
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
- const bool boostExactMatches) const = 0;
+ const bool boostExactMatches, const bool hasProbabilityZero) const = 0;
virtual void getMostProbableString(const DicTraverseSession *const traverseSession,
- const float languageWeight, SuggestionResults *const outSuggestionResults) const = 0;
- virtual float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession,
- DicNode *const terminals, const int size) const = 0;
+ const float weightOfLangModelVsSpatialModel,
+ SuggestionResults *const outSuggestionResults) const = 0;
+ virtual float getAdjustedWeightOfLangModelVsSpatialModel(
+ DicTraverseSession *const traverseSession, DicNode *const terminals,
+ const int size) const = 0;
virtual float getDoubleLetterDemotionDistanceCost(
const DicNode *const terminalDicNode) const = 0;
virtual bool autoCorrectsToMultiWordSuggestionIfTop() const = 0;
diff --git a/native/jni/src/suggest/core/policy/traversal.h b/native/jni/src/suggest/core/policy/traversal.h
index 8ddaa0514..5b6616d9a 100644
--- a/native/jni/src/suggest/core/policy/traversal.h
+++ b/native/jni/src/suggest/core/policy/traversal.h
@@ -44,11 +44,12 @@ class Traversal {
virtual bool needsToTraverseAllUserInput() const = 0;
virtual float getMaxSpatialDistance() const = 0;
virtual int getDefaultExpandDicNodeSize() const = 0;
- virtual int getMaxCacheSize(const int inputSize) const = 0;
+ virtual int getMaxCacheSize(const int inputSize, const float weightForLocale) const = 0;
virtual int getTerminalCacheSize() const = 0;
virtual bool isPossibleOmissionChildNode(const DicTraverseSession *const traverseSession,
const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0;
- virtual bool isGoodToTraverseNextWord(const DicNode *const dicNode) const = 0;
+ virtual bool isGoodToTraverseNextWord(const DicNode *const dicNode,
+ const int probability) const = 0;
protected:
Traversal() {}
diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp
index c202b81fe..450203d98 100644
--- a/native/jni/src/suggest/core/policy/weighting.cpp
+++ b/native/jni/src/suggest/core/policy/weighting.cpp
@@ -110,12 +110,16 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return weighting->getOmissionCost(parentDicNode, dicNode);
case CT_ADDITIONAL_PROXIMITY:
// only used for typing
- return weighting->getAdditionalProximityCost();
+ // TODO: Quit calling getMatchedCost().
+ return weighting->getAdditionalProximityCost()
+ + weighting->getMatchedCost(traverseSession, dicNode, inputStateG);
case CT_SUBSTITUTION:
// only used for typing
- return weighting->getSubstitutionCost();
+ // TODO: Quit calling getMatchedCost().
+ return weighting->getSubstitutionCost()
+ + weighting->getMatchedCost(traverseSession, dicNode, inputStateG);
case CT_NEW_WORD_SPACE_OMISSION:
- return weighting->getNewWordSpatialCost(traverseSession, dicNode, inputStateG);
+ return weighting->getSpaceOmissionCost(traverseSession, dicNode, inputStateG);
case CT_MATCH:
return weighting->getMatchedCost(traverseSession, dicNode, inputStateG);
case CT_COMPLETION:
@@ -176,9 +180,9 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_OMISSION:
return 0;
case CT_ADDITIONAL_PROXIMITY:
- return 0; /* 0 because CT_MATCH will be called */
+ return 1;
case CT_SUBSTITUTION:
- return 0; /* 0 because CT_MATCH will be called */
+ return 1;
case CT_NEW_WORD_SPACE_OMISSION:
return 0;
case CT_MATCH:
diff --git a/native/jni/src/suggest/core/policy/weighting.h b/native/jni/src/suggest/core/policy/weighting.h
index bd6b3cf41..863c4eabe 100644
--- a/native/jni/src/suggest/core/policy/weighting.h
+++ b/native/jni/src/suggest/core/policy/weighting.h
@@ -57,7 +57,7 @@ class Weighting {
const DicTraverseSession *const traverseSession,
const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0;
- virtual float getNewWordSpatialCost(const DicTraverseSession *const traverseSession,
+ virtual float getSpaceOmissionCost(const DicTraverseSession *const traverseSession,
const DicNode *const dicNode, DicNode_InputStateG *const inputStateG) const = 0;
virtual float getNewWordBigramLanguageCost(
diff --git a/native/jni/src/suggest/core/result/suggestion_results.cpp b/native/jni/src/suggest/core/result/suggestion_results.cpp
index 4c10bd08a..3756d1092 100644
--- a/native/jni/src/suggest/core/result/suggestion_results.cpp
+++ b/native/jni/src/suggest/core/result/suggestion_results.cpp
@@ -23,7 +23,7 @@ namespace latinime {
void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCount,
jintArray outputCodePointsArray, jintArray outScoresArray, jintArray outSpaceIndicesArray,
jintArray outTypesArray, jintArray outAutoCommitFirstWordConfidenceArray,
- jfloatArray outLanguageWeight) {
+ jfloatArray outWeightOfLangModelVsSpatialModel) {
int outputIndex = 0;
while (!mSuggestedWords.empty()) {
const SuggestedWord &suggestedWord = mSuggestedWords.top();
@@ -44,7 +44,8 @@ void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCo
mSuggestedWords.pop();
}
JniDataUtils::putIntToArray(env, outSuggestionCount, 0 /* index */, outputIndex);
- JniDataUtils::putFloatToArray(env, outLanguageWeight, 0 /* index */, mLanguageWeight);
+ JniDataUtils::putFloatToArray(env, outWeightOfLangModelVsSpatialModel, 0 /* index */,
+ mWeightOfLangModelVsSpatialModel);
}
void SuggestionResults::addPrediction(const int *const codePoints, const int codePointCount,
@@ -89,7 +90,7 @@ void SuggestionResults::getSortedScores(int *const outScores) const {
}
void SuggestionResults::dumpSuggestions() const {
- AKLOGE("language weight: %f", mLanguageWeight);
+ AKLOGE("weight of language model vs spatial model: %f", mWeightOfLangModelVsSpatialModel);
std::vector<SuggestedWord> suggestedWords;
auto copyOfSuggestedWords = mSuggestedWords;
while (!copyOfSuggestedWords.empty()) {
diff --git a/native/jni/src/suggest/core/result/suggestion_results.h b/native/jni/src/suggest/core/result/suggestion_results.h
index 8e845e2d3..738c78a9f 100644
--- a/native/jni/src/suggest/core/result/suggestion_results.h
+++ b/native/jni/src/suggest/core/result/suggestion_results.h
@@ -29,13 +29,15 @@ namespace latinime {
class SuggestionResults {
public:
explicit SuggestionResults(const int maxSuggestionCount)
- : mMaxSuggestionCount(maxSuggestionCount), mLanguageWeight(NOT_A_LANGUAGE_WEIGHT),
+ : mMaxSuggestionCount(maxSuggestionCount),
+ mWeightOfLangModelVsSpatialModel(NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL),
mSuggestedWords() {}
// Returns suggestion count.
void outputSuggestions(JNIEnv *env, jintArray outSuggestionCount, jintArray outCodePointsArray,
jintArray outScoresArray, jintArray outSpaceIndicesArray, jintArray outTypesArray,
- jintArray outAutoCommitFirstWordConfidenceArray, jfloatArray outLanguageWeight);
+ jintArray outAutoCommitFirstWordConfidenceArray,
+ jfloatArray outWeightOfLangModelVsSpatialModel);
void addPrediction(const int *const codePoints, const int codePointCount, const int score);
void addSuggestion(const int *const codePoints, const int codePointCount,
const int score, const int type, const int indexToPartialCommit,
@@ -43,8 +45,8 @@ class SuggestionResults {
void getSortedScores(int *const outScores) const;
void dumpSuggestions() const;
- void setLanguageWeight(const float languageWeight) {
- mLanguageWeight = languageWeight;
+ void setWeightOfLangModelVsSpatialModel(const float weightOfLangModelVsSpatialModel) {
+ mWeightOfLangModelVsSpatialModel = weightOfLangModelVsSpatialModel;
}
int getSuggestionCount() const {
@@ -55,7 +57,7 @@ class SuggestionResults {
DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestionResults);
const int mMaxSuggestionCount;
- float mLanguageWeight;
+ float mWeightOfLangModelVsSpatialModel;
std::priority_queue<
SuggestedWord, std::vector<SuggestedWord>, SuggestedWord::Comparator> mSuggestedWords;
};
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
index 0b99b75ec..7c37241de 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
@@ -19,9 +19,9 @@
#include <algorithm>
#include <vector>
+#include "dictionary/utils/binary_dictionary_shortcut_iterator.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_utils.h"
-#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h"
#include "suggest/core/dictionary/error_type_utils.h"
#include "suggest/core/policy/scoring.h"
#include "suggest/core/result/suggestion_results.h"
@@ -34,7 +34,8 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
/* static */ void SuggestionsOutputUtils::outputSuggestions(
const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
- const float languageWeight, SuggestionResults *const outSuggestionResults) {
+ const float weightOfLangModelVsSpatialModel,
+ SuggestionResults *const outSuggestionResults) {
#if DEBUG_EVALUATE_MOST_PROBABLE_STRING
const int terminalSize = 0;
#else
@@ -44,12 +45,15 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
for (int index = terminalSize - 1; index >= 0; --index) {
traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]);
}
- // Compute a language weight when an invalid language weight is passed.
- // NOT_A_LANGUAGE_WEIGHT (-1) is assumed as an invalid language weight.
- const float languageWeightToOutputSuggestions = (languageWeight < 0.0f) ?
- scoringPolicy->getAdjustedLanguageWeight(
- traverseSession, terminals.data(), terminalSize) : languageWeight;
- outSuggestionResults->setLanguageWeight(languageWeightToOutputSuggestions);
+ // Compute a weight of language model when an invalid weight is passed.
+ // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value.
+ const float weightOfLangModelVsSpatialModelToOutputSuggestions =
+ (weightOfLangModelVsSpatialModel < 0.0f)
+ ? scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession,
+ terminals.data(), terminalSize)
+ : weightOfLangModelVsSpatialModel;
+ outSuggestionResults->setWeightOfLangModelVsSpatialModel(
+ weightOfLangModelVsSpatialModelToOutputSuggestions);
// Force autocorrection for obvious long multi-word suggestions when the top suggestion is
// a long multiple words suggestion.
// TODO: Implement a smarter auto-commit method for handling multi-word suggestions.
@@ -65,16 +69,62 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
// Output suggestion results here
for (auto &terminalDicNode : terminals) {
outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode,
- languageWeightToOutputSuggestions, boostExactMatches, forceCommitMultiWords,
- outputSecondWordFirstLetterInputIndex, outSuggestionResults);
+ weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches,
+ forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults);
+ }
+ scoringPolicy->getMostProbableString(traverseSession,
+ weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
+}
+
+/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
+ const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
+ const WordAttributes wordAttributes, const bool isLastWord) {
+ const bool currentWordExactMatch =
+ ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
+ // When we have to block offensive words, non-exact matched offensive words should not be
+ // output.
+ const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
+
+ const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
+ wordAttributes.isPossiblyOffensive();
+
+ // This function is called in two situations:
+ //
+ // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
+ // of the search, and isLastWord will be true.
+ // "fuck"
+ // |
+ // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
+ // In this case, if the current word is an exact match, we will always let the word
+ // through, even if the user is blocking offensive words (it's exactly what they typed!)
+ //
+ // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
+ // to start a new search at root, to try to match the rest of the input. In this case,
+ // terminalDicNode will point to the terminal node we just hit, and isLastWord will be
+ // false.
+ // "fuckvthis"
+ // |
+ // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
+ //
+ // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this"
+ // when offensive words are blocked would be a bad idea).
+ //
+ // In the case of a multi-word correction where the offensive word is typed last (eg.
+ // for the input "allfuck"), this function will be called with isLastWord==true, but
+ // currentWordExactMatch==false. So we are OK in this case as well.
+ // "allfuck"
+ // |
+ // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
+ if (isLastWord && currentWordExactMatch) {
+ return false;
+ } else {
+ return isBlockedOffensiveWord;
}
- scoringPolicy->getMostProbableString(traverseSession, languageWeightToOutputSuggestions,
- outSuggestionResults);
}
/* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
- const DicNode *const terminalDicNode, const float languageWeight,
+ const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
const bool boostExactMatches, const bool forceCommitMultiWords,
const bool outputSecondWordFirstLetterInputIndex,
SuggestionResults *const outSuggestionResults) {
@@ -83,34 +133,32 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
}
const float doubleLetterCost =
scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode);
- const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight)
- + doubleLetterCost;
- const bool isPossiblyOffensiveWord =
- traverseSession->getDictionaryStructurePolicy()->getProbability(
- terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0;
+ const float compoundDistance =
+ terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel)
+ + doubleLetterCost;
+ const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy()
+ ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(),
+ terminalDicNode->getWordId(), nullptr /* multiBigramMap */);
const bool isExactMatch =
ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
const bool isExactMatchWithIntentionalOmission =
ErrorTypeUtils::isExactMatchWithIntentionalOmission(
terminalDicNode->getContainedErrorTypes());
- const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
- // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
- // (e.g. "AMD" and "and")
- const bool isSafeExactMatch = isExactMatch
- && !(isPossiblyOffensiveWord && isFirstCharUppercase);
+ // TODO: Decide whether the word should be auto-corrected or not here.
+ const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent(
+ terminalDicNode->getContainedErrorTypes());
const int outputTypeFlags =
- (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
- | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
+ (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
+ | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
| (isExactMatchWithIntentionalOmission ?
- Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
-
+ Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0)
+ | (isAppropriateForAutoCorrection ?
+ Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0);
// Entries that are blacklisted or do not represent a word should not be output.
- const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord();
- // When we have to block offensive words, non-exact matched offensive words should not be
- // output.
- const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
- const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord
- && !isSafeExactMatch;
+ const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
+
+ const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
+ terminalDicNode, wordAttributes, true /* isLastWord */);
// Increase output score of top typing suggestion to ensure autocorrection.
// TODO: Better integration with java side autocorrection logic.
@@ -118,11 +166,11 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
compoundDistance, traverseSession->getInputSize(),
terminalDicNode->getContainedErrorTypes(),
(forceCommitMultiWords && terminalDicNode->hasMultipleWords()),
- boostExactMatches);
+ boostExactMatches, wordAttributes.getProbability() == 0);
// Don't output invalid or blocked offensive words. However, we still need to submit their
// shortcuts if any.
- if (isValidWord && !isBlockedOffensiveWord) {
+ if (isValidWord && !shouldBlockThisWord) {
int codePoints[MAX_WORD_LENGTH];
terminalDicNode->outputResult(codePoints);
const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
@@ -139,10 +187,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
// Shortcut is not supported for multiple words suggestions.
// TODO: Check shortcuts during traversal for multiple words suggestions.
if (!terminalDicNode->hasMultipleWords()) {
- BinaryDictionaryShortcutIterator shortcutIt(
- traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(),
- traverseSession->getDictionaryStructurePolicy()
- ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos()));
+ BinaryDictionaryShortcutIterator shortcutIt =
+ traverseSession->getDictionaryStructurePolicy()->getShortcutIterator(
+ terminalDicNode->getWordId());
const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode);
outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults);
}
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h
index b099b4776..bcb75a483 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.h
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h
@@ -18,6 +18,7 @@
#define LATINIME_SUGGESTIONS_OUTPUT_UTILS
#include "defines.h"
+#include "dictionary/property/word_attributes.h"
namespace latinime {
@@ -25,15 +26,23 @@ class BinaryDictionaryShortcutIterator;
class DicNode;
class DicTraverseSession;
class Scoring;
+class SuggestOptions;
class SuggestionResults;
class SuggestionsOutputUtils {
public:
/**
+ * Returns true if we should block the incoming word, in the context of the user's
+ * preferences to include or not include possibly offensive words
+ */
+ static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
+ const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
+ const bool isLastWord);
+ /**
* Outputs the final list of suggestions (i.e., terminal nodes).
*/
static void outputSuggestions(const Scoring *const scoringPolicy,
- DicTraverseSession *traverseSession, const float languageWeight,
+ DicTraverseSession *traverseSession, const float weightOfLangModelVsSpatialModel,
SuggestionResults *const outSuggestionResults);
private:
@@ -44,7 +53,7 @@ class SuggestionsOutputUtils {
static void outputSuggestionsOfDicNode(const Scoring *const scoringPolicy,
DicTraverseSession *traverseSession, const DicNode *const terminalDicNode,
- const float languageWeight, const bool boostExactMatches,
+ const float weightOfLangModelVsSpatialModel, const bool boostExactMatches,
const bool forceCommitMultiWords, const bool outputSecondWordFirstLetterInputIndex,
SuggestionResults *const outSuggestionResults);
static void outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt,
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
index f1e411f38..d7dd5a02d 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
@@ -17,10 +17,10 @@
#include "suggest/core/session/dic_traverse_session.h"
#include "defines.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/property/ngram_context.h"
#include "suggest/core/dictionary/dictionary.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
-#include "suggest/core/session/prev_words_info.h"
namespace latinime {
@@ -30,13 +30,13 @@ const int DicTraverseSession::DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_S
256 * 1024;
void DicTraverseSession::init(const Dictionary *const dictionary,
- const PrevWordsInfo *const prevWordsInfo, const SuggestOptions *const suggestOptions) {
+ const NgramContext *const ngramContext, const SuggestOptions *const suggestOptions) {
mDictionary = dictionary;
mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy()
->getMultiWordCostMultiplier();
mSuggestOptions = suggestOptions;
- prevWordsInfo->getPrevWordsTerminalPtNodePos(
- getDictionaryStructurePolicy(), mPrevWordsPtNodePos, true /* tryLowerCaseSearch */);
+ mPrevWordIdCount = ngramContext->getPrevWordIds(getDictionaryStructurePolicy(),
+ &mPrevWordIdArray, true /* tryLowerCaseSearch */).size();
}
void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo,
@@ -69,8 +69,12 @@ void DicTraverseSession::initializeProximityInfoStates(const int *const inputCod
for (int i = 0; i < maxPointerCount; ++i) {
mProximityInfoStates[i].initInputParams(i, maxSpatialDistance, getProximityInfo(),
inputCodePoints, inputSize, inputXs, inputYs, times, pointerIds,
- maxPointerCount == MAX_POINTER_COUNT_G
- /* TODO: this is a hack. fix proximity info state */);
+ // Right now the line below is trying to figure out whether this is a gesture by
+ // looking at the pointer count and assuming whatever is above the cutoff is
+ // a gesture and whatever is below is typing. This is hacky and incorrect; we
+ // should pass the correct information instead.
+ maxPointerCount == MAX_POINTER_COUNT_G,
+ getDictionaryStructurePolicy()->getHeaderStructurePolicy()->getLocale());
mInputSize += mProximityInfoStates[i].size();
}
}
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h
index 5a51a112d..f5fcfddcd 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.h
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.h
@@ -20,16 +20,17 @@
#include <vector>
#include "defines.h"
+#include "dictionary/utils/multi_bigram_map.h"
#include "jni.h"
#include "suggest/core/dicnode/dic_nodes_cache.h"
-#include "suggest/core/dictionary/multi_bigram_map.h"
#include "suggest/core/layout/proximity_info_state.h"
+#include "utils/int_array_view.h"
namespace latinime {
class Dictionary;
class DictionaryStructureWithBufferPolicy;
-class PrevWordsInfo;
+class NgramContext;
class ProximityInfo;
class SuggestOptions;
@@ -50,20 +51,17 @@ class DicTraverseSession {
}
AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache)
- : mProximityInfo(nullptr), mDictionary(nullptr), mSuggestOptions(nullptr),
- mDicNodesCache(usesLargeCache), mMultiBigramMap(), mInputSize(0), mMaxPointerCount(1),
- mMultiWordCostMultiplier(1.0f) {
+ : mPrevWordIdCount(0), mProximityInfo(nullptr), mDictionary(nullptr),
+ mSuggestOptions(nullptr), mDicNodesCache(usesLargeCache), mMultiBigramMap(),
+ mInputSize(0), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) {
// NOTE: mProximityInfoStates is an array of instances.
// No need to initialize it explicitly here.
- for (size_t i = 0; i < NELEMS(mPrevWordsPtNodePos); ++i) {
- mPrevWordsPtNodePos[i] = NOT_A_DICT_POS;
- }
}
// Non virtual inline destructor -- never inherit this class
AK_FORCE_INLINE ~DicTraverseSession() {}
- void init(const Dictionary *dictionary, const PrevWordsInfo *const prevWordsInfo,
+ void init(const Dictionary *dictionary, const NgramContext *const ngramContext,
const SuggestOptions *const suggestOptions);
// TODO: Remove and merge into init
void setupForGetSuggestions(const ProximityInfo *pInfo, const int *inputCodePoints,
@@ -79,7 +77,9 @@ class DicTraverseSession {
//--------------------
const ProximityInfo *getProximityInfo() const { return mProximityInfo; }
const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; }
- const int *getPrevWordsPtNodePos() const { return mPrevWordsPtNodePos; }
+ const WordIdArrayView getPrevWordIds() const {
+ return WordIdArrayView::fromArray(mPrevWordIdArray).limit(mPrevWordIdCount);
+ }
DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; }
MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; }
const ProximityInfoState *getProximityInfoState(int id) const {
@@ -166,7 +166,8 @@ class DicTraverseSession {
const int *const inputYs, const int *const times, const int *const pointerIds,
const int inputSize, const float maxSpatialDistance, const int maxPointerCount);
- int mPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> mPrevWordIdArray;
+ size_t mPrevWordIdCount;
const ProximityInfo *mProximityInfo;
const Dictionary *mDictionary;
const SuggestOptions *mSuggestOptions;
diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h
deleted file mode 100644
index e44e876e9..000000000
--- a/native/jni/src/suggest/core/session/prev_words_info.h
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_PREV_WORDS_INFO_H
-#define LATINIME_PREV_WORDS_INFO_H
-
-#include "defines.h"
-#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
-#include "utils/char_utils.h"
-
-namespace latinime {
-
-// TODO: Support n-gram.
-class PrevWordsInfo {
- public:
- // No prev word information.
- PrevWordsInfo() {
- clear();
- }
-
- PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) {
- for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
- mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i];
- memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i],
- sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
- mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i];
- }
- }
-
- // Construct from previous words.
- PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH],
- const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
- const size_t prevWordCount) {
- clear();
- for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) {
- if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
- continue;
- }
- memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
- sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
- mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
- mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
- }
- }
-
- // Construct from a previous word.
- PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount,
- const bool isBeginningOfSentence) {
- clear();
- if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
- return;
- }
- memmove(mPrevWordCodePoints[0], prevWordCodePoints,
- sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
- mPrevWordCodePointCount[0] = prevWordCodePointCount;
- mIsBeginningOfSentence[0] = isBeginningOfSentence;
- }
-
- bool isValid() const {
- if (mPrevWordCodePointCount[0] > 0) {
- return true;
- }
- if (mIsBeginningOfSentence[0]) {
- return true;
- }
- return false;
- }
-
- void getPrevWordsTerminalPtNodePos(
- const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
- int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const {
- for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
- outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy,
- mPrevWordCodePoints[i], mPrevWordCodePointCount[i],
- mIsBeginningOfSentence[i], tryLowerCaseSearch);
- }
- }
-
- // n is 1-indexed.
- const int *getNthPrevWordCodePoints(const int n) const {
- if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
- return nullptr;
- }
- return mPrevWordCodePoints[n - 1];
- }
-
- // n is 1-indexed.
- int getNthPrevWordCodePointCount(const int n) const {
- if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
- return 0;
- }
- return mPrevWordCodePointCount[n - 1];
- }
-
- // n is 1-indexed.
- bool isNthPrevWordBeginningOfSentence(const int n) const {
- if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) {
- return false;
- }
- return mIsBeginningOfSentence[n - 1];
- }
-
- private:
- DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo);
-
- static int getTerminalPtNodePosOfWord(
- const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
- const int *const wordCodePoints, const int wordCodePointCount,
- const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
- if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
- return NOT_A_DICT_POS;
- }
- int codePoints[MAX_WORD_LENGTH];
- int codePointCount = wordCodePointCount;
- memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
- if (isBeginningOfSentence) {
- codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints,
- codePointCount, MAX_WORD_LENGTH);
- if (codePointCount <= 0) {
- return NOT_A_DICT_POS;
- }
- }
- const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord(
- codePoints, codePointCount, false /* forceLowerCaseSearch */);
- if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) {
- // Return the position when when the word was found or doesn't try lower case
- // search.
- return wordPtNodePos;
- }
- // Check bigrams for lower-cased previous word if original was not found. Useful for
- // auto-capitalized words like "The [current_word]".
- return dictStructurePolicy->getTerminalPtNodePositionOfWord(
- codePoints, codePointCount, true /* forceLowerCaseSearch */);
- }
-
- void clear() {
- for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
- mPrevWordCodePointCount[i] = 0;
- mIsBeginningOfSentence[i] = false;
- }
- }
-
- int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
- int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
-};
-} // namespace latinime
-#endif // LATINIME_PREV_WORDS_INFO_H
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 0cd305f5a..52fa5a5db 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -16,17 +16,20 @@
#include "suggest/core/suggest.h"
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "dictionary/property/word_attributes.h"
#include "suggest/core/dicnode/dic_node.h"
#include "suggest/core/dicnode/dic_node_priority_queue.h"
#include "suggest/core/dicnode/dic_node_vector.h"
#include "suggest/core/dictionary/dictionary.h"
#include "suggest/core/dictionary/digraph_utils.h"
#include "suggest/core/layout/proximity_info.h"
-#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h"
#include "suggest/core/policy/traversal.h"
#include "suggest/core/policy/weighting.h"
#include "suggest/core/result/suggestions_output_utils.h"
#include "suggest/core/session/dic_traverse_session.h"
+#include "suggest/core/suggest_options.h"
+#include "utils/profiler.h"
namespace latinime {
@@ -44,10 +47,10 @@ const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2;
*/
void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession,
int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints,
- int inputSize, const float languageWeight,
+ int inputSize, const float weightOfLangModelVsSpatialModel,
SuggestionResults *const outSuggestionResults) const {
- PROF_OPEN;
- PROF_START(0);
+ PROF_INIT;
+ PROF_TIMER_START(0);
const float maxSpatialDistance = TRAVERSAL->getMaxSpatialDistance();
DicTraverseSession *tSession = static_cast<DicTraverseSession *>(traverseSession);
tSession->setupForGetSuggestions(pInfo, inputCodePoints, inputSize, inputXs, inputYs, times,
@@ -55,8 +58,8 @@ void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession,
// TODO: Add the way to evaluate cache
initializeSearch(tSession);
- PROF_END(0);
- PROF_START(1);
+ PROF_TIMER_END(0);
+ PROF_TIMER_START(1);
// keep expanding search dicNodes until all have terminated.
while (tSession->getDicTraverseCache()->activeSize() > 0) {
@@ -64,12 +67,11 @@ void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession,
tSession->getDicTraverseCache()->advanceActiveDicNodes();
tSession->getDicTraverseCache()->advanceInputIndex(inputSize);
}
- PROF_END(1);
- PROF_START(2);
+ PROF_TIMER_END(1);
+ PROF_TIMER_START(2);
SuggestionsOutputUtils::outputSuggestions(
- SCORING, tSession, languageWeight, outSuggestionResults);
- PROF_END(2);
- PROF_CLOSE;
+ SCORING, tSession, weightOfLangModelVsSpatialModel, outSuggestionResults);
+ PROF_TIMER_END(2);
}
/**
@@ -87,12 +89,13 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession) const {
traverseSession->getDicTraverseCache()->continueSearch();
} else {
// Restart recognition at the root.
- traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize()),
+ traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize(),
+ traverseSession->getSuggestOptions()->weightForLocale()),
TRAVERSAL->getTerminalCacheSize());
// Create a new dic node here
DicNode rootNode;
DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(),
- traverseSession->getPrevWordsPtNodePos(), &rootNode);
+ traverseSession->getPrevWordIds(), &rootNode);
traverseSession->getDicTraverseCache()->copyPushActive(&rootNode);
}
}
@@ -157,8 +160,7 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const {
// TODO: Remove. Do not prune node here.
const bool allowsErrorCorrections = TRAVERSAL->allowsErrorCorrections(&dicNode);
// Process for handling space substitution (e.g., hevis => he is)
- if (allowsErrorCorrections
- && TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) {
+ if (TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) {
createNextWordDicNode(traverseSession, &dicNode, true /* spaceSubstitution */);
}
@@ -281,7 +283,6 @@ void Suggest::processDicNodeAsAdditionalProximityChar(DicTraverseSession *traver
// not treat the node as a terminal. There is no need to pass the bigram map in these cases.
Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_ADDITIONAL_PROXIMITY,
traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */);
- weightChildNode(traverseSession, childDicNode);
processExpandedDicNode(traverseSession, childDicNode);
}
@@ -289,7 +290,6 @@ void Suggest::processDicNodeAsSubstitution(DicTraverseSession *traverseSession,
DicNode *dicNode, DicNode *childDicNode) const {
Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SUBSTITUTION, traverseSession,
dicNode, childDicNode, 0 /* multiBigramMap */);
- weightChildNode(traverseSession, childDicNode);
processExpandedDicNode(traverseSession, childDicNode);
}
@@ -400,7 +400,7 @@ void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicN
if (dicNode->isCompletion(inputSize)) {
Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_COMPLETION, traverseSession,
0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */);
- } else { // completion
+ } else {
Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_MATCH, traverseSession,
0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */);
}
@@ -412,7 +412,16 @@ void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicN
*/
void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode,
const bool spaceSubstitution) const {
- if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode)) {
+ const WordAttributes wordAttributes =
+ traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext(
+ dicNode->getPrevWordIds(), dicNode->getWordId(),
+ traverseSession->getMultiBigramMap());
+ if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
+ dicNode, wordAttributes, false /* isLastWord */)) {
+ return;
+ }
+
+ if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
return;
}
diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h
index 788e0314b..65d5918cf 100644
--- a/native/jni/src/suggest/core/suggest.h
+++ b/native/jni/src/suggest/core/suggest.h
@@ -49,7 +49,8 @@ class Suggest : public SuggestInterface {
AK_FORCE_INLINE virtual ~Suggest() {}
void getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs,
int *times, int *pointerIds, int *inputCodePoints, int inputSize,
- const float languageWeight, SuggestionResults *const outSuggestionResults) const;
+ const float weightOfLangModelVsSpatialModel,
+ SuggestionResults *const outSuggestionResults) const;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest);
diff --git a/native/jni/src/suggest/core/suggest_interface.h b/native/jni/src/suggest/core/suggest_interface.h
index a6e5aefae..a05aa9c80 100644
--- a/native/jni/src/suggest/core/suggest_interface.h
+++ b/native/jni/src/suggest/core/suggest_interface.h
@@ -28,7 +28,8 @@ class SuggestInterface {
public:
virtual void getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs,
int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize,
- const float languageWeight, SuggestionResults *const suggestionResults) const = 0;
+ const float weightOfLangModelVsSpatialModel,
+ SuggestionResults *const suggestionResults) const = 0;
SuggestInterface() {}
virtual ~SuggestInterface() {}
private:
diff --git a/native/jni/src/suggest/core/suggest_options.h b/native/jni/src/suggest/core/suggest_options.h
index d456680dd..4d331292b 100644
--- a/native/jni/src/suggest/core/suggest_options.h
+++ b/native/jni/src/suggest/core/suggest_options.h
@@ -42,6 +42,12 @@ class SuggestOptions{
return getBoolOption(SPACE_AWARE_GESTURE_ENABLED);
}
+ AK_FORCE_INLINE float weightForLocale() const {
+ // The weight is in thousands and we want the real value, so we divide by 1000.
+ // NativeSuggestOptions#setWeightForLocale does the opposite processing in Java.
+ return static_cast<float>(getIntOption(WEIGHT_FOR_LOCALE_IN_THOUSANDS)) / 1000.0f;
+ }
+
AK_FORCE_INLINE bool getAdditionalFeaturesBoolOption(const int key) const {
return getBoolOption(key + ADDITIONAL_FEATURES_OPTIONS);
}
@@ -55,9 +61,10 @@ class SuggestOptions{
static const int USE_FULL_EDIT_DISTANCE = 1;
static const int BLOCK_OFFENSIVE_WORDS = 2;
static const int SPACE_AWARE_GESTURE_ENABLED = 3;
+ static const int WEIGHT_FOR_LOCALE_IN_THOUSANDS = 4;
// Additional features options are stored after the other options and used as setting values of
// experimental features.
- static const int ADDITIONAL_FEATURES_OPTIONS = 4;
+ static const int ADDITIONAL_FEATURES_OPTIONS = 5;
const int *const mOptions;
const int mLength;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp
deleted file mode 100644
index 08dc107ab..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
-
-#include "suggest/core/dictionary/property/bigram_property.h"
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
-
-namespace latinime {
-
-void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability,
- bool *const outHasNext, int *const bigramEntryPos) const {
- const BigramEntry bigramEntry =
- mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos);
- if (outBigramPos) {
- // Lookup target PtNode position.
- *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
- bigramEntry.getTargetTerminalId());
- }
- if (outProbability) {
- if (bigramEntry.hasHistoricalInfo()) {
- *outProbability =
- ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(),
- mHeaderPolicy);
- } else {
- *outProbability = bigramEntry.getProbability();
- }
- }
- if (outHasNext) {
- *outHasNext = bigramEntry.hasNext();
- }
-}
-
-bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) {
- // 1. The word has no bigrams yet.
- // 2. The word has bigrams, and there is the target in the list.
- // 3. The word has bigrams, and there is an invalid entry that can be reclaimed.
- // 4. The word has bigrams. We have to append new bigram entry to the list.
- // 5. Same as 4, but the list is the last entry of the content file.
- if (outAddedNewEntry) {
- *outAddedNewEntry = false;
- }
- const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
- if (bigramListPos == NOT_A_DICT_POS) {
- // Case 1. PtNode that doesn't have a bigram list.
- // Create new bigram list.
- if (!mBigramDictContent->createNewBigramList(terminalId)) {
- return false;
- }
- const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
- newTargetTerminalId);
- const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry,
- bigramProperty);
- // Write an entry.
- int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId);
- if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite,
- &writingPos)) {
- AKLOGE("Cannot write bigram entry. pos: %d.", writingPos);
- return false;
- }
- if (!mBigramDictContent->writeTerminator(writingPos)) {
- AKLOGE("Cannot write bigram list terminator. pos: %d.", writingPos);
- return false;
- }
- if (outAddedNewEntry) {
- *outAddedNewEntry = true;
- }
- return true;
- }
-
- int tailEntryPos = NOT_A_DICT_POS;
- const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos,
- &tailEntryPos);
- if (entryPosToUpdate == NOT_A_DICT_POS) {
- // Case 4, 5. Add new entry to the bigram list.
- const int contentTailPos = mBigramDictContent->getContentTailPos();
- // If the tail entry is at the tail of content buffer, the new entry can be written without
- // link (Case 5).
- const bool canAppendEntry =
- contentTailPos == tailEntryPos + mBigramDictContent->getBigramEntrySize();
- const int newEntryPos = canAppendEntry ? tailEntryPos : contentTailPos;
- int writingPos = newEntryPos;
- // Write new entry at the tail position of the bigram content.
- const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
- newTargetTerminalId);
- const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
- &newBigramEntry, bigramProperty);
- if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite,
- &writingPos)) {
- AKLOGE("Cannot write bigram entry. pos: %d.", writingPos);
- return false;
- }
- if (!mBigramDictContent->writeTerminator(writingPos)) {
- AKLOGE("Cannot write bigram list terminator. pos: %d.", writingPos);
- return false;
- }
- if (!canAppendEntry) {
- // Update link of the current tail entry.
- if (!mBigramDictContent->writeLink(newEntryPos, tailEntryPos)) {
- AKLOGE("Cannot update bigram entry link. pos: %d, linked entry pos: %d.",
- tailEntryPos, newEntryPos);
- return false;
- }
- }
- if (outAddedNewEntry) {
- *outAddedNewEntry = true;
- }
- return true;
- }
-
- // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry.
- const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate);
- if (!originalBigramEntry.isValid()) {
- // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing
- // entry is updated.
- if (outAddedNewEntry) {
- *outAddedNewEntry = true;
- }
- }
- const BigramEntry updatedBigramEntry =
- originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId);
- const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(
- &updatedBigramEntry, bigramProperty);
- return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate);
-}
-
-bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) {
- const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
- if (bigramListPos == NOT_A_DICT_POS) {
- // Bigram list doesn't exist.
- return false;
- }
- const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos,
- nullptr /* outTailEntryPos */);
- if (entryPosToUpdate == NOT_A_DICT_POS) {
- // Bigram entry doesn't exist.
- return false;
- }
- const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate);
- if (targetTerminalId != bigramEntry.getTargetTerminalId()) {
- // Bigram entry doesn't exist.
- return false;
- }
- // Remove bigram entry by marking it as invalid entry and overwriting the original entry.
- const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
- return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate);
-}
-
-bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
- int *const outBigramCount) {
- const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
- if (bigramListPos == NOT_A_DICT_POS) {
- // Bigram list doesn't exist.
- return true;
- }
- bool hasNext = true;
- int readingPos = bigramListPos;
- while (hasNext) {
- const BigramEntry bigramEntry =
- mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
- const int entryPos = readingPos - mBigramDictContent->getBigramEntrySize();
- hasNext = bigramEntry.hasNext();
- if (!bigramEntry.isValid()) {
- continue;
- }
- const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition(
- bigramEntry.getTargetTerminalId());
- if (targetPtNodePos == NOT_A_DICT_POS) {
- // Invalidate bigram entry.
- const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
- if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
- return false;
- }
- } else if (bigramEntry.hasHistoricalInfo()) {
- const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave(
- bigramEntry.getHistoricalInfo(), mHeaderPolicy);
- if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) {
- const BigramEntry updatedBigramEntry =
- bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo);
- if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
- return false;
- }
- *outBigramCount += 1;
- } else {
- // Remove entry.
- const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry();
- if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) {
- return false;
- }
- }
- } else {
- *outBigramCount += 1;
- }
- }
- return true;
-}
-
-int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) {
- const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId);
- if (bigramListPos == NOT_A_DICT_POS) {
- // Bigram list doesn't exist.
- return 0;
- }
- int bigramCount = 0;
- bool hasNext = true;
- int readingPos = bigramListPos;
- while (hasNext) {
- const BigramEntry bigramEntry =
- mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
- hasNext = bigramEntry.hasNext();
- if (bigramEntry.isValid()) {
- bigramCount++;
- }
- }
- return bigramCount;
-}
-
-int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind,
- const int bigramListPos, int *const outTailEntryPos) const {
- if (outTailEntryPos) {
- *outTailEntryPos = NOT_A_DICT_POS;
- }
- int invalidEntryPos = NOT_A_DICT_POS;
- int readingPos = bigramListPos;
- while (true) {
- const BigramEntry bigramEntry =
- mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
- const int entryPos = readingPos - mBigramDictContent->getBigramEntrySize();
- if (!bigramEntry.hasNext()) {
- if (outTailEntryPos) {
- *outTailEntryPos = entryPos;
- }
- break;
- }
- if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) {
- // Entry with same target is found.
- return entryPos;
- } else if (!bigramEntry.isValid()) {
- // Invalid entry that can be reused is found.
- invalidEntryPos = entryPos;
- }
- }
- return invalidEntryPos;
-}
-
-const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom(
- const BigramEntry *const originalBigramEntry,
- const BigramProperty *const bigramProperty) const {
- // TODO: Consolidate historical info and probability.
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) {
- const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(),
- bigramProperty->getLevel(), bigramProperty->getCount());
- const HistoricalInfo updatedHistoricalInfo =
- ForgettingCurveUtils::createUpdatedHistoricalInfo(
- originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(),
- &historicalInfoForUpdate, mHeaderPolicy);
- return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo);
- } else {
- return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability());
- }
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h
deleted file mode 100644
index 4b3bb3725..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_VER4_BIGRAM_LIST_POLICY_H
-#define LATINIME_VER4_BIGRAM_LIST_POLICY_H
-
-#include "defines.h"
-#include "suggest/core/policy/dictionary_bigrams_structure_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h"
-
-namespace latinime {
-
-class BigramDictContent;
-class BigramProperty;
-class HeaderPolicy;
-class TerminalPositionLookupTable;
-
-class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy {
- public:
- Ver4BigramListPolicy(BigramDictContent *const bigramDictContent,
- const TerminalPositionLookupTable *const terminalPositionLookupTable,
- const HeaderPolicy *const headerPolicy)
- : mBigramDictContent(bigramDictContent),
- mTerminalPositionLookupTable(terminalPositionLookupTable),
- mHeaderPolicy(headerPolicy) {}
-
- void getNextBigram(int *const outBigramPos, int *const outProbability,
- bool *const outHasNext, int *const bigramEntryPos) const;
-
- bool skipAllBigrams(int *const pos) const {
- // Do nothing because we don't need to skip bigram lists in ver4 dictionaries.
- return true;
- }
-
- bool addNewEntry(const int terminalId, const int newTargetTerminalId,
- const BigramProperty *const bigramProperty, bool *const outAddedNewEntry);
-
- bool removeEntry(const int terminalId, const int targetTerminalId);
-
- bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId,
- int *const outBigramCount);
-
- int getBigramEntryConut(const int terminalId);
-
- private:
- DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy);
-
- int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos,
- int *const outTailEntryPos) const;
-
- const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry,
- const BigramProperty *const bigramProperty) const;
-
- BigramDictContent *const mBigramDictContent;
- const TerminalPositionLookupTable *const mTerminalPositionLookupTable;
- const HeaderPolicy *const mHeaderPolicy;
-};
-} // namespace latinime
-#endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
deleted file mode 100644
index d7e1952b5..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h"
-
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-
-namespace latinime {
-
-const int BigramDictContent::INVALID_LINKED_ENTRY_POS = Ver4DictConstants::NOT_A_TERMINAL_ID;
-
-const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition(
- int *const bigramEntryPos) const {
- const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer();
- const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize();
- if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) {
- AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, "
- "bufSize: %d", *bigramEntryPos, bigramEntryTailPos,
- bigramListBuffer->getTailPosition());
- ASSERT(false);
- return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
- Ver4DictConstants::NOT_A_TERMINAL_ID);
- }
- const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition(
- Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos);
- const bool isLink = (bigramFlags & Ver4DictConstants::BIGRAM_IS_LINK_MASK) != 0;
- int probability = NOT_A_PROBABILITY;
- int timestamp = NOT_A_TIMESTAMP;
- int level = 0;
- int count = 0;
- if (mHasHistoricalInfo) {
- timestamp = bigramListBuffer->readUintAndAdvancePosition(
- Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos);
- level = bigramListBuffer->readUintAndAdvancePosition(
- Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos);
- count = bigramListBuffer->readUintAndAdvancePosition(
- Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos);
- } else {
- probability = bigramListBuffer->readUintAndAdvancePosition(
- Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos);
- }
- const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition(
- Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos);
- const int targetTerminalId =
- (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ?
- Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId;
- if (isLink) {
- const int linkedEntryPos = targetTerminalId;
- if (linkedEntryPos == INVALID_LINKED_ENTRY_POS) {
- // Bigram list terminator is found.
- return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY,
- Ver4DictConstants::NOT_A_TERMINAL_ID);
- }
- *bigramEntryPos = linkedEntryPos;
- return getBigramEntryAndAdvancePosition(bigramEntryPos);
- }
- // hasNext is always true because we should continue to read the next entry until the terminator
- // is found.
- if (mHasHistoricalInfo) {
- const HistoricalInfo historicalInfo(timestamp, level, count);
- return BigramEntry(true /* hasNext */, probability, &historicalInfo, targetTerminalId);
- } else {
- return BigramEntry(true /* hasNext */, probability, targetTerminalId);
- }
-}
-
-bool BigramDictContent::writeBigramEntryAndAdvancePosition(
- const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) {
- return writeBigramEntryAttributesAndAdvancePosition(false /* isLink */,
- bigramEntryToWrite->getProbability(), bigramEntryToWrite->getTargetTerminalId(),
- bigramEntryToWrite->getHistoricalInfo()->getTimeStamp(),
- bigramEntryToWrite->getHistoricalInfo()->getLevel(),
- bigramEntryToWrite->getHistoricalInfo()->getCount(),
- entryWritingPos);
-}
-
-bool BigramDictContent::writeBigramEntryAttributesAndAdvancePosition(
- const bool isLink, const int probability, const int targetTerminalId,
- const int timestamp, const int level, const int count, int *const entryWritingPos) {
- BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer();
- const int bigramFlags = isLink ? Ver4DictConstants::BIGRAM_IS_LINK_MASK : 0;
- if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags,
- Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) {
- AKLOGE("Cannot write bigram flags. pos: %d, flags: %x", *entryWritingPos, bigramFlags);
- return false;
- }
- if (mHasHistoricalInfo) {
- if (!bigramListBuffer->writeUintAndAdvancePosition(timestamp,
- Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) {
- AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos,
- timestamp);
- return false;
- }
- if (!bigramListBuffer->writeUintAndAdvancePosition(level,
- Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) {
- AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos,
- level);
- return false;
- }
- if (!bigramListBuffer->writeUintAndAdvancePosition(count,
- Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) {
- AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos,
- count);
- return false;
- }
- } else {
- if (!bigramListBuffer->writeUintAndAdvancePosition(probability,
- Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) {
- AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos,
- probability);
- return false;
- }
- }
- const int targetTerminalIdToWrite = (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) ?
- Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : targetTerminalId;
- if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite,
- Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) {
- AKLOGE("Cannot write bigram target terminal id. pos: %d, target terminal id: %d",
- *entryWritingPos, targetTerminalId);
- return false;
- }
- return true;
-}
-
-bool BigramDictContent::writeLink(const int linkedEntryPos, const int writingPos) {
- const int targetTerminalId = linkedEntryPos;
- int pos = writingPos;
- return writeBigramEntryAttributesAndAdvancePosition(true /* isLink */,
- NOT_A_PROBABILITY /* probability */, targetTerminalId, NOT_A_TIMESTAMP, 0 /* level */,
- 0 /* count */, &pos);
-}
-
-bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- const BigramDictContent *const originalBigramDictContent,
- int *const outBigramEntryCount) {
- for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin();
- it != terminalIdMap->end(); ++it) {
- const int originalBigramListPos =
- originalBigramDictContent->getBigramListHeadPos(it->first);
- if (originalBigramListPos == NOT_A_DICT_POS) {
- // This terminal does not have a bigram list.
- continue;
- }
- const int bigramListPos = getContentBuffer()->getTailPosition();
- int bigramEntryCount = 0;
- // Copy bigram list with GC from original content.
- if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos,
- terminalIdMap, &bigramEntryCount)) {
- AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d",
- originalBigramListPos, bigramListPos);
- return false;
- }
- if (bigramEntryCount == 0) {
- // All bigram entries are useless. This terminal does not have a bigram list.
- continue;
- }
- *outBigramEntryCount += bigramEntryCount;
- // Set bigram list position to the lookup table.
- if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) {
- AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d",
- it->second, bigramListPos);
- return false;
- }
- }
- return true;
-}
-
-// Returns whether GC for the bigram list was succeeded or not.
-bool BigramDictContent::runGCBigramList(const int bigramListPos,
- const BigramDictContent *const sourceBigramDictContent, const int toPos,
- const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- int *const outEntryCount) {
- bool hasNext = true;
- int readingPos = bigramListPos;
- int writingPos = toPos;
- while (hasNext) {
- const BigramEntry originalBigramEntry =
- sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
- hasNext = originalBigramEntry.hasNext();
- if (!originalBigramEntry.isValid()) {
- continue;
- }
- TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
- terminalIdMap->find(originalBigramEntry.getTargetTerminalId());
- if (it == terminalIdMap->end()) {
- // Target word has been removed.
- continue;
- }
- const BigramEntry updatedBigramEntry =
- originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second);
- if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) {
- AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos);
- return false;
- }
- *outEntryCount += 1;
- }
- if (*outEntryCount > 0) {
- if (!writeTerminator(writingPos)) {
- AKLOGE("Cannot write terminator to run GC. pos: %d", writingPos);
- return false;
- }
- }
- return true;
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h
deleted file mode 100644
index 361dd2c74..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_BIGRAM_DICT_CONTENT_H
-#define LATINIME_BIGRAM_DICT_CONTENT_H
-
-#include <cstdint>
-#include <cstdio>
-
-#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-
-namespace latinime {
-
-class BigramDictContent : public SparseTableDictContent {
- public:
- BigramDictContent(uint8_t *const *buffers, const int *bufferSizes, const bool hasHistoricalInfo)
- : SparseTableDictContent(buffers, bufferSizes,
- Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
- Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
- mHasHistoricalInfo(hasHistoricalInfo) {}
-
- BigramDictContent(const bool hasHistoricalInfo)
- : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE,
- Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE),
- mHasHistoricalInfo(hasHistoricalInfo) {}
-
- int getContentTailPos() const {
- return getContentBuffer()->getTailPosition();
- }
-
- const BigramEntry getBigramEntry(const int bigramEntryPos) const {
- int readingPos = bigramEntryPos;
- return getBigramEntryAndAdvancePosition(&readingPos);
- }
-
- const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const;
-
- // Returns head position of bigram list for a PtNode specified by terminalId.
- int getBigramListHeadPos(const int terminalId) const {
- const SparseTable *const addressLookupTable = getAddressLookupTable();
- if (!addressLookupTable->contains(terminalId)) {
- return NOT_A_DICT_POS;
- }
- return addressLookupTable->get(terminalId);
- }
-
- bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) {
- int writingPos = getContentBuffer()->getTailPosition();
- return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos);
- }
-
- bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) {
- int writingPos = entryWritingPos;
- return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos);
- }
-
- bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite,
- int *const entryWritingPos);
-
- bool writeTerminator(const int writingPos) {
- // Terminator is a link to the invalid position.
- return writeLink(INVALID_LINKED_ENTRY_POS, writingPos);
- }
-
- bool writeLink(const int linkedPos, const int writingPos);
-
- bool createNewBigramList(const int terminalId) {
- const int bigramListPos = getContentBuffer()->getTailPosition();
- return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos);
- }
-
- bool flushToFile(FILE *const file) const {
- return flush(file);
- }
-
- bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- const BigramDictContent *const originalBigramDictContent,
- int *const outBigramEntryCount);
-
- int getBigramEntrySize() const {
- if (mHasHistoricalInfo) {
- return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
- + Ver4DictConstants::TIME_STAMP_FIELD_SIZE
- + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE
- + Ver4DictConstants::WORD_COUNT_FIELD_SIZE
- + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
- } else {
- return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE
- + Ver4DictConstants::PROBABILITY_SIZE
- + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE;
- }
- }
-
- private:
- DISALLOW_COPY_AND_ASSIGN(BigramDictContent);
-
- static const int INVALID_LINKED_ENTRY_POS;
-
- bool writeBigramEntryAttributesAndAdvancePosition(
- const bool isLink, const int probability, const int targetTerminalId,
- const int timestamp, const int level, const int count, int *const entryWritingPos);
-
- bool runGCBigramList(const int bigramListPos,
- const BigramDictContent *const sourceBigramDictContent, const int toPos,
- const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- int *const outEntryCount);
-
- bool mHasHistoricalInfo;
-};
-} // namespace latinime
-#endif /* LATINIME_BIGRAM_DICT_CONTENT_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h
deleted file mode 100644
index 2b0cbd93b..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_BIGRAM_ENTRY_H
-#define LATINIME_BIGRAM_ENTRY_H
-
-#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/historical_info.h"
-
-namespace latinime {
-
-class BigramEntry {
- public:
- BigramEntry(const BigramEntry& bigramEntry)
- : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability),
- mHistoricalInfo(), mTargetTerminalId(bigramEntry.mTargetTerminalId) {}
-
- // Entry with historical information.
- BigramEntry(const bool hasNext, const int probability, const int targetTerminalId)
- : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(),
- mTargetTerminalId(targetTerminalId) {}
-
- // Entry with historical information.
- BigramEntry(const bool hasNext, const int probability,
- const HistoricalInfo *const historicalInfo, const int targetTerminalId)
- : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo),
- mTargetTerminalId(targetTerminalId) {}
-
- const BigramEntry getInvalidatedEntry() const {
- return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID);
- }
-
- const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const {
- return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId);
- }
-
- const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const {
- return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId);
- }
-
- const BigramEntry updateProbabilityAndGetEntry(const int probability) const {
- return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId);
- }
-
- const BigramEntry updateHistoricalInfoAndGetEntry(
- const HistoricalInfo *const historicalInfo) const {
- return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId);
- }
-
- bool isValid() const {
- return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID;
- }
-
- bool hasNext() const {
- return mHasNext;
- }
-
- int getProbability() const {
- return mProbability;
- }
-
- bool hasHistoricalInfo() const {
- return mHistoricalInfo.isValid();
- }
-
- const HistoricalInfo *getHistoricalInfo() const {
- return &mHistoricalInfo;
- }
-
- int getTargetTerminalId() const {
- return mTargetTerminalId;
- }
-
- private:
- // Copy constructor is public to use this class as a type of return value.
- DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry);
- DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry);
-
- const bool mHasNext;
- const int mProbability;
- const HistoricalInfo mHistoricalInfo;
- const int mTargetTerminalId;
-};
-} // namespace latinime
-#endif /* LATINIME_BIGRAM_ENTRY_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
deleted file mode 100644
index 5dc91ba10..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (C) 2014, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h"
-
-namespace latinime {
-
-bool LanguageModelDictContent::save(FILE *const file) const {
- return mTrieMap.save(file);
-}
-
-bool LanguageModelDictContent::runGC(
- const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- const LanguageModelDictContent *const originalContent,
- int *const outNgramCount) {
- return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(),
- 0 /* nextLevelBitmapEntryIndex */, outNgramCount);
-}
-
-ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry(
- const WordIdArrayView prevWordIds, const int wordId) const {
- const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
- if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
- return ProbabilityEntry();
- }
- const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex);
- if (!result.mIsValid) {
- // Not found.
- return ProbabilityEntry();
- }
- return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo);
-}
-
-bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds,
- const int terminalId, const ProbabilityEntry *const probabilityEntry) {
- const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds);
- if (bitmapEntryIndex == TrieMap::INVALID_INDEX) {
- return false;
- }
- return mTrieMap.put(terminalId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex);
-}
-
-bool LanguageModelDictContent::runGCInner(
- const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- const TrieMap::TrieMapRange trieMapRange,
- const int nextLevelBitmapEntryIndex, int *const outNgramCount) {
- for (auto &entry : trieMapRange) {
- const auto it = terminalIdMap->find(entry.key());
- if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) {
- // The word has been removed.
- continue;
- }
- if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) {
- return false;
- }
- if (outNgramCount) {
- *outNgramCount += 1;
- }
- if (entry.hasNextLevelMap()) {
- if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(),
- mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex),
- outNgramCount)) {
- return false;
- }
- }
- }
- return true;
-}
-
-int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const {
- int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex();
- for (const int wordId : prevWordIds) {
- const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex);
- if (!result.mIsValid) {
- return TrieMap::INVALID_INDEX;
- }
- bitmapEntryIndex = result.mNextLevelBitmapEntryIndex;
- }
- return bitmapEntryIndex;
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h
deleted file mode 100644
index 18f2e0170..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (C) 2014, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
-#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H
-
-#include <cstdio>
-
-#include "defines.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h"
-#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/utils/trie_map.h"
-#include "utils/byte_array_view.h"
-#include "utils/int_array_view.h"
-
-namespace latinime {
-
-/**
- * Class representing language model.
- *
- * This class provides methods to get and store unigram/n-gram probability information and flags.
- */
-class LanguageModelDictContent {
- public:
- LanguageModelDictContent(const ReadWriteByteArrayView trieMapBuffer,
- const bool hasHistoricalInfo)
- : mTrieMap(trieMapBuffer), mHasHistoricalInfo(hasHistoricalInfo) {}
-
- explicit LanguageModelDictContent(const bool hasHistoricalInfo)
- : mTrieMap(), mHasHistoricalInfo(hasHistoricalInfo) {}
-
- bool isNearSizeLimit() const {
- return mTrieMap.isNearSizeLimit();
- }
-
- bool save(FILE *const file) const;
-
- bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- const LanguageModelDictContent *const originalContent,
- int *const outNgramCount);
-
- ProbabilityEntry getProbabilityEntry(const int wordId) const {
- return getNgramProbabilityEntry(WordIdArrayView(), wordId);
- }
-
- bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) {
- return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry);
- }
-
- ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds,
- const int wordId) const;
-
- bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId,
- const ProbabilityEntry *const probabilityEntry);
-
- private:
- DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent);
-
- TrieMap mTrieMap;
- const bool mHasHistoricalInfo;
-
- bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap,
- const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex,
- int *const outNgramCount);
-
- int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const;
-};
-} // namespace latinime
-#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
deleted file mode 100644
index 723808399..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h"
-
-#include <vector>
-
-#include "suggest/core/dicnode/dic_node.h"
-#include "suggest/core/dicnode/dic_node_vector.h"
-#include "suggest/core/dictionary/ngram_listener.h"
-#include "suggest/core/dictionary/property/bigram_property.h"
-#include "suggest/core/dictionary/property/unigram_property.h"
-#include "suggest/core/dictionary/property/word_property.h"
-#include "suggest/core/session/prev_words_info.h"
-#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
-#include "suggest/policyimpl/dictionary/utils/probability_utils.h"
-
-namespace latinime {
-
-// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and
-// BinaryDictionaryDecayingTests.
-const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT";
-const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT";
-const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT";
-const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT";
-const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024;
-const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS =
- Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS;
-
-void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode,
- DicNodeVector *const childDicNodes) const {
- if (!dicNode->hasChildren()) {
- return;
- }
- DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
- readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos());
- while (!readingHelper.isEnd()) {
- const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams();
- if (!ptNodeParams.isValid()) {
- break;
- }
- bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted();
- if (isTerminal && mHeaderPolicy->isDecayingDict()) {
- // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose
- // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a
- // valid terminal DicNode.
- isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY;
- }
- readingHelper.readNextSiblingNode(ptNodeParams);
- if (ptNodeParams.representsNonWordInfo()) {
- // Skip PtNodes that represent non-word information.
- continue;
- }
- childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(),
- ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal,
- ptNodeParams.hasChildren(),
- ptNodeParams.isBlacklisted()
- || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */,
- ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints());
- }
- if (readingHelper.isError()) {
- mIsCorrupted = true;
- AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
- }
-}
-
-int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount(
- const int ptNodePos, const int maxCodePointCount, int *const outCodePoints,
- int *const outUnigramProbability) const {
- DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
- readingHelper.initWithPtNodePos(ptNodePos);
- const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount(
- maxCodePointCount, outCodePoints, outUnigramProbability);
- if (readingHelper.isError()) {
- mIsCorrupted = true;
- AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount().");
- }
- return codePointCount;
-}
-
-int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord,
- const int length, const bool forceLowerCaseSearch) const {
- DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
- readingHelper.initWithPtNodeArrayPos(getRootPosition());
- const int ptNodePos =
- readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch);
- if (readingHelper.isError()) {
- mIsCorrupted = true;
- AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes().");
- }
- return ptNodePos;
-}
-
-int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability,
- const int bigramProbability) const {
- if (mHeaderPolicy->isDecayingDict()) {
- // Both probabilities are encoded. Decode them and get probability.
- return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability);
- } else {
- if (unigramProbability == NOT_A_PROBABILITY) {
- return NOT_A_PROBABILITY;
- } else if (bigramProbability == NOT_A_PROBABILITY) {
- return ProbabilityUtils::backoff(unigramProbability);
- } else {
- return bigramProbability;
- }
- }
-}
-
-int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos,
- const int ptNodePos) const {
- if (ptNodePos == NOT_A_DICT_POS) {
- return NOT_A_PROBABILITY;
- }
- const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
- if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) {
- return NOT_A_PROBABILITY;
- }
- if (prevWordsPtNodePos) {
- const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
- BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
- while (bigramsIt.hasNext()) {
- bigramsIt.next();
- if (bigramsIt.getBigramPos() == ptNodePos
- && bigramsIt.getProbability() != NOT_A_PROBABILITY) {
- return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability());
- }
- }
- return NOT_A_PROBABILITY;
- }
- return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY);
-}
-
-void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos,
- NgramListener *const listener) const {
- if (!prevWordsPtNodePos) {
- return;
- }
- const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]);
- BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition);
- while (bigramsIt.hasNext()) {
- bigramsIt.next();
- listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos());
- }
-}
-
-int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const {
- if (ptNodePos == NOT_A_DICT_POS) {
- return NOT_A_DICT_POS;
- }
- const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
- if (ptNodeParams.isDeleted()) {
- return NOT_A_DICT_POS;
- }
- return mBuffers->getShortcutDictContent()->getShortcutListHeadPos(
- ptNodeParams.getTerminalId());
-}
-
-int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const {
- if (ptNodePos == NOT_A_DICT_POS) {
- return NOT_A_DICT_POS;
- }
- const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos));
- if (ptNodeParams.isDeleted()) {
- return NOT_A_DICT_POS;
- }
- return mBuffers->getBigramDictContent()->getBigramListHeadPos(
- ptNodeParams.getTerminalId());
-}
-
-bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length,
- const UnigramProperty *const unigramProperty) {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary.");
- return false;
- }
- if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
- AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
- mDictBuffer->getTailPosition());
- return false;
- }
- if (length > MAX_WORD_LENGTH) {
- AKLOGE("The word is too long to insert to the dictionary, length: %d", length);
- return false;
- }
- for (const auto &shortcut : unigramProperty->getShortcuts()) {
- if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
- AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d",
- shortcut.getTargetCodePoints()->size());
- return false;
- }
- }
- DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
- readingHelper.initWithPtNodeArrayPos(getRootPosition());
- bool addedNewUnigram = false;
- int codePointsToAdd[MAX_WORD_LENGTH];
- int codePointCountToAdd = length;
- memmove(codePointsToAdd, word, sizeof(int) * length);
- if (unigramProperty->representsBeginningOfSentence()) {
- codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd,
- codePointCountToAdd, MAX_WORD_LENGTH);
- }
- if (codePointCountToAdd <= 0) {
- return false;
- }
- if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd,
- unigramProperty, &addedNewUnigram)) {
- if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) {
- mUnigramCount++;
- }
- if (unigramProperty->getShortcuts().size() > 0) {
- // Add shortcut target.
- const int wordPos = getTerminalPtNodePositionOfWord(word, length,
- false /* forceLowerCaseSearch */);
- if (wordPos == NOT_A_DICT_POS) {
- AKLOGE("Cannot find terminal PtNode position to add shortcut target.");
- return false;
- }
- for (const auto &shortcut : unigramProperty->getShortcuts()) {
- if (!mUpdatingHelper.addShortcutTarget(wordPos,
- shortcut.getTargetCodePoints()->data(),
- shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) {
- AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, "
- "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(),
- shortcut.getProbability());
- return false;
- }
- }
- }
- return true;
- } else {
- return false;
- }
-}
-
-bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary.");
- return false;
- }
- const int ptNodePos = getTerminalPtNodePositionOfWord(word, length,
- false /* forceLowerCaseSearch */);
- if (ptNodePos == NOT_A_DICT_POS) {
- return false;
- }
- const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) {
- AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos);
- return false;
- }
- if (!ptNodeParams.representsNonWordInfo()) {
- mUnigramCount--;
- }
- return true;
-}
-
-bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const BigramProperty *const bigramProperty) {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary.");
- return false;
- }
- if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
- AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
- mDictBuffer->getTailPosition());
- return false;
- }
- if (!prevWordsInfo->isValid()) {
- AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary.");
- return false;
- }
- if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) {
- AKLOGE("The word is too long to insert the ngram to the dictionary. "
- "length: %d", bigramProperty->getTargetCodePoints()->size());
- return false;
- }
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
- false /* tryLowerCaseSearch */);
- const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos);
- // TODO: Support N-gram.
- if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
- if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) {
- const std::vector<UnigramProperty::ShortcutProperty> shortcuts;
- const UnigramProperty beginningOfSentenceUnigramProperty(
- true /* representsBeginningOfSentence */, true /* isNotAWord */,
- false /* isBlacklisted */, MAX_PROBABILITY /* probability */,
- NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts);
- if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */),
- prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */),
- &beginningOfSentenceUnigramProperty)) {
- AKLOGE("Cannot add unigram entry for the beginning-of-sentence.");
- return false;
- }
- // Refresh Terminal PtNode positions.
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
- false /* tryLowerCaseSearch */);
- } else {
- return false;
- }
- }
- const int word1Pos = getTerminalPtNodePositionOfWord(
- bigramProperty->getTargetCodePoints()->data(),
- bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */);
- if (word1Pos == NOT_A_DICT_POS) {
- return false;
- }
- bool addedNewEntry = false;
- if (mUpdatingHelper.addNgramEntry(prevWordsPtNodePosView, word1Pos, bigramProperty,
- &addedNewEntry)) {
- if (addedNewEntry) {
- mBigramCount++;
- }
- return true;
- } else {
- return false;
- }
-}
-
-bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo,
- const int *const word, const int length) {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary.");
- return false;
- }
- if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) {
- AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d",
- mDictBuffer->getTailPosition());
- return false;
- }
- if (!prevWordsInfo->isValid()) {
- AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary.");
- return false;
- }
- if (length > MAX_WORD_LENGTH) {
- AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length);
- }
- int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos,
- false /* tryLowerCaseSerch */);
- const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos);
- // TODO: Support N-gram.
- if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) {
- return false;
- }
- const int wordPos = getTerminalPtNodePositionOfWord(word, length,
- false /* forceLowerCaseSearch */);
- if (wordPos == NOT_A_DICT_POS) {
- return false;
- }
- if (mUpdatingHelper.removeNgramEntry(prevWordsPtNodePosView, wordPos)) {
- mBigramCount--;
- return true;
- } else {
- return false;
- }
-}
-
-bool Ver4PatriciaTriePolicy::flush(const char *const filePath) {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath);
- return false;
- }
- if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) {
- AKLOGE("Cannot flush the dictionary to file.");
- mIsCorrupted = true;
- return false;
- }
- return true;
-}
-
-bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary.");
- return false;
- }
- if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) {
- AKLOGE("Cannot flush the dictionary to file with GC.");
- mIsCorrupted = true;
- return false;
- }
- return true;
-}
-
-bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const {
- if (!mBuffers->isUpdatable()) {
- AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary.");
- return false;
- }
- if (mBuffers->isNearSizeLimit()) {
- // Additional buffer size is near the limit.
- return true;
- } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize()
- > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) {
- // Total extended region size of the trie exceeds the limit.
- return true;
- } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS
- && mDictBuffer->getUsedAdditionalBufferSize() > 0) {
- // Needs to reduce dictionary size.
- return true;
- } else if (mHeaderPolicy->isDecayingDict()) {
- return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount,
- mHeaderPolicy);
- }
- return false;
-}
-
-void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength,
- char *const outResult, const int maxResultLength) {
- const int compareLength = queryLength + 1 /* terminator */;
- if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) {
- snprintf(outResult, maxResultLength, "%d", mUnigramCount);
- } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) {
- snprintf(outResult, maxResultLength, "%d", mBigramCount);
- } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) {
- snprintf(outResult, maxResultLength, "%d",
- mHeaderPolicy->isDecayingDict() ?
- ForgettingCurveUtils::getUnigramCountHardLimit(
- mHeaderPolicy->getMaxUnigramCount()) :
- static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
- } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) {
- snprintf(outResult, maxResultLength, "%d",
- mHeaderPolicy->isDecayingDict() ?
- ForgettingCurveUtils::getBigramCountHardLimit(
- mHeaderPolicy->getMaxBigramCount()) :
- static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE));
- }
-}
-
-const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints,
- const int codePointCount) const {
- const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount,
- false /* forceLowerCaseSearch */);
- if (ptNodePos == NOT_A_DICT_POS) {
- AKLOGE("getWordProperty is called for invalid word.");
- return WordProperty();
- }
- const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- std::vector<int> codePointVector(ptNodeParams.getCodePoints(),
- ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount());
- const ProbabilityEntry probabilityEntry =
- mBuffers->getLanguageModelDictContent()->getProbabilityEntry(
- ptNodeParams.getTerminalId());
- const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
- // Fetch bigram information.
- std::vector<BigramProperty> bigrams;
- const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos);
- if (bigramListPos != NOT_A_DICT_POS) {
- int bigramWord1CodePoints[MAX_WORD_LENGTH];
- const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent();
- const TerminalPositionLookupTable *const terminalPositionLookupTable =
- mBuffers->getTerminalPositionLookupTable();
- bool hasNext = true;
- int readingPos = bigramListPos;
- while (hasNext) {
- const BigramEntry bigramEntry =
- bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
- hasNext = bigramEntry.hasNext();
- const int word1TerminalId = bigramEntry.getTargetTerminalId();
- const int word1TerminalPtNodePos =
- terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId);
- if (word1TerminalPtNodePos == NOT_A_DICT_POS) {
- continue;
- }
- // Word (unigram) probability
- int word1Probability = NOT_A_PROBABILITY;
- const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
- word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints,
- &word1Probability);
- const std::vector<int> word1(bigramWord1CodePoints,
- bigramWord1CodePoints + codePointCount);
- const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo();
- const int probability = bigramEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(
- bigramEntry.getHistoricalInfo(), mHeaderPolicy) :
- bigramEntry.getProbability();
- bigrams.emplace_back(&word1, probability,
- historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
- historicalInfo->getCount());
- }
- }
- // Fetch shortcut information.
- std::vector<UnigramProperty::ShortcutProperty> shortcuts;
- int shortcutPos = getShortcutPositionOfPtNode(ptNodePos);
- if (shortcutPos != NOT_A_DICT_POS) {
- int shortcutTarget[MAX_WORD_LENGTH];
- const ShortcutDictContent *const shortcutDictContent =
- mBuffers->getShortcutDictContent();
- bool hasNext = true;
- while (hasNext) {
- int shortcutTargetLength = 0;
- int shortcutProbability = NOT_A_PROBABILITY;
- shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget,
- &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos);
- const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength);
- shortcuts.emplace_back(&target, shortcutProbability);
- }
- }
- const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(),
- ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(),
- historicalInfo->getTimeStamp(), historicalInfo->getLevel(),
- historicalInfo->getCount(), &shortcuts);
- return WordProperty(&codePointVector, &unigramProperty, &bigrams);
-}
-
-int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints,
- int *const outCodePointCount) {
- *outCodePointCount = 0;
- if (token == 0) {
- mTerminalPtNodePositionsForIteratingWords.clear();
- DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy(
- &mTerminalPtNodePositionsForIteratingWords);
- DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader);
- readingHelper.initWithPtNodeArrayPos(getRootPosition());
- readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy);
- }
- const int terminalPtNodePositionsVectorSize =
- static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size());
- if (token < 0 || token >= terminalPtNodePositionsVectorSize) {
- AKLOGE("Given token %d is invalid.", token);
- return 0;
- }
- const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token];
- int unigramProbability = NOT_A_PROBABILITY;
- *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(
- terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability);
- const int nextToken = token + 1;
- if (nextToken >= terminalPtNodePositionsVectorSize) {
- // All words have been iterated.
- mTerminalPtNodePositionsForIteratingWords.clear();
- return 0;
- }
- return nextToken;
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
deleted file mode 100644
index 4220312e0..000000000
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h"
-
-#include <cstring>
-#include <queue>
-
-#include "suggest/policyimpl/dictionary/header/header_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h"
-#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h"
-#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h"
-#include "suggest/policyimpl/dictionary/utils/file_utils.h"
-#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h"
-
-namespace latinime {
-
-bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath,
- const int unigramCount, const int bigramCount) const {
- const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
- BufferWithExtendableBuffer headerBuffer(
- BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
- const int extendedRegionSize = headerPolicy->getExtendedRegionSize()
- + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize();
- if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */,
- unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) {
- AKLOGE("Cannot write header structure to buffer. "
- "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, "
- "extendedRegionSize: %d", false, unigramCount, bigramCount,
- extendedRegionSize);
- return false;
- }
- return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
-}
-
-bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos,
- const char *const dictDirPath) {
- const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy();
- Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers(
- Ver4DictBuffers::createVer4DictBuffers(headerPolicy,
- Ver4DictConstants::MAX_DICTIONARY_SIZE));
- int unigramCount = 0;
- int bigramCount = 0;
- if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) {
- return false;
- }
- BufferWithExtendableBuffer headerBuffer(
- BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE);
- if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */,
- unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) {
- return false;
- }
- return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer);
-}
-
-bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos,
- const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite,
- int *const outUnigramCount, int *const outBigramCount) {
- Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(),
- mBuffers->getLanguageModelDictContent(), headerPolicy);
- Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer());
- Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(),
- mBuffers->getTerminalPositionLookupTable(), headerPolicy);
- Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(),
- mBuffers->getTerminalPositionLookupTable());
- Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(),
- mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
- &shortcutPolicy);
-
- DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader);
- readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
- DynamicPtGcEventListeners
- ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
- traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted(
- &ptNodeWriter);
- if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
- &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) {
- return false;
- }
- const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted
- .getValidUnigramCount();
- const int maxUnigramCount = headerPolicy->getMaxUnigramCount();
- if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) {
- if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) {
- AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount,
- maxUnigramCount);
- return false;
- }
- }
-
- readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
- DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability
- traversePolicyToUpdateBigramProbability(&ptNodeWriter);
- if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
- &traversePolicyToUpdateBigramProbability)) {
- return false;
- }
- const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount();
- const int maxBigramCount = headerPolicy->getMaxBigramCount();
- if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) {
- if (!truncateBigrams(maxBigramCount)) {
- AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount);
- return false;
- }
- }
-
- // Mapping from positions in mBuffer to positions in bufferToWrite.
- PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap;
- readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
- Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(),
- buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy,
- &shortcutPolicy);
- DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer
- traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers,
- buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap);
- if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
- &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) {
- return false;
- }
-
- // Create policy instances for the GCed dictionary.
- Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(),
- buffersToWrite->getLanguageModelDictContent(), headerPolicy);
- Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer());
- Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(),
- buffersToWrite->getTerminalPositionLookupTable(), headerPolicy);
- Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(),
- buffersToWrite->getTerminalPositionLookupTable());
- Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(),
- buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy,
- &newShortcutPolicy);
- // Re-assign terminal IDs for valid terminal PtNodes.
- TerminalPositionLookupTable::TerminalIdMap terminalIdMap;
- if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds(
- &terminalIdMap)) {
- return false;
- }
- // Run GC for probability dict content.
- if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap,
- mBuffers->getLanguageModelDictContent(), nullptr /* outNgramCount */)) {
- return false;
- }
- // Run GC for bigram dict content.
- if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap,
- mBuffers->getBigramDictContent(), outBigramCount)) {
- return false;
- }
- // Run GC for shortcut dict content.
- if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap,
- mBuffers->getShortcutDictContent())) {
- return false;
- }
- DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader);
- newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
- DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields
- traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap);
- if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner(
- &traversePolicyToUpdateAllPositionFields)) {
- return false;
- }
- newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos);
- TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
- traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap);
- if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner(
- &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) {
- return false;
- }
- *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount();
- return true;
-}
-
-bool Ver4PatriciaTrieWritingHelper::truncateUnigrams(
- const Ver4PatriciaTrieNodeReader *const ptNodeReader,
- Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) {
- const TerminalPositionLookupTable *const terminalPosLookupTable =
- mBuffers->getTerminalPositionLookupTable();
- const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
- std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
- priorityQueue;
- for (int i = 0; i < nextTerminalId; ++i) {
- const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i);
- if (terminalPos == NOT_A_DICT_POS) {
- continue;
- }
- const ProbabilityEntry probabilityEntry =
- mBuffers->getLanguageModelDictContent()->getProbabilityEntry(i);
- const int probability = probabilityEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(
- probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
- probabilityEntry.getProbability();
- priorityQueue.push(DictProbability(terminalPos, probability,
- probabilityEntry.getHistoricalInfo()->getTimeStamp()));
- }
-
- // Delete unigrams.
- while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) {
- const int ptNodePos = priorityQueue.top().getDictPos();
- priorityQueue.pop();
- const PtNodeParams ptNodeParams =
- ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
- if (ptNodeParams.representsNonWordInfo()) {
- continue;
- }
- if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) {
- AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos);
- return false;
- }
- }
- return true;
-}
-
-bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) {
- const TerminalPositionLookupTable *const terminalPosLookupTable =
- mBuffers->getTerminalPositionLookupTable();
- const int nextTerminalId = terminalPosLookupTable->getNextTerminalId();
- std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator>
- priorityQueue;
- BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent();
- for (int i = 0; i < nextTerminalId; ++i) {
- const int bigramListPos = bigramDictContent->getBigramListHeadPos(i);
- if (bigramListPos == NOT_A_DICT_POS) {
- continue;
- }
- bool hasNext = true;
- int readingPos = bigramListPos;
- while (hasNext) {
- const BigramEntry bigramEntry =
- bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos);
- const int entryPos = readingPos - bigramDictContent->getBigramEntrySize();
- hasNext = bigramEntry.hasNext();
- if (!bigramEntry.isValid()) {
- continue;
- }
- const int probability = bigramEntry.hasHistoricalInfo() ?
- ForgettingCurveUtils::decodeProbability(
- bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) :
- bigramEntry.getProbability();
- priorityQueue.push(DictProbability(entryPos, probability,
- bigramEntry.getHistoricalInfo()->getTimeStamp()));
- }
- }
-
- // Delete bigrams.
- while (static_cast<int>(priorityQueue.size()) > maxBigramCount) {
- const int entryPos = priorityQueue.top().getDictPos();
- const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos);
- const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry();
- if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) {
- AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos);
- return false;
- }
- priorityQueue.pop();
- }
- return true;
-}
-
-bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds
- ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) {
- if (!ptNodeParams->isTerminal()) {
- return true;
- }
- TerminalPositionLookupTable::TerminalIdMap::const_iterator it =
- mTerminalIdMap->find(ptNodeParams->getTerminalId());
- if (it == mTerminalIdMap->end()) {
- AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd",
- ptNodeParams->getTerminalId(), mTerminalIdMap->size());
- return false;
- }
- if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) {
- AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second);
- return false;
- }
- return true;
-}
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
index 3fc566e7a..856808a74 100644
--- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
@@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120;
const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f;
const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f;
+const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f;
const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f;
const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f;
const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
@@ -31,6 +32,7 @@ const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f;
// TODO: Unlimit max cache dic node size
const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170;
const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310;
+const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE = 50;
const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4;
const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f;
@@ -47,18 +49,21 @@ const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f;
const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f;
const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f;
const float ScoringParams::TRANSPOSITION_COST = 0.5608f;
-const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.334f;
-const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f;
+const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.33f;
+const float ScoringParams::SPACE_OMISSION_COST = 0.1f;
+const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.37972f;
const float ScoringParams::SUBSTITUTION_COST = 0.3806f;
-const float ScoringParams::COST_NEW_WORD = 0.0314f;
const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f;
const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f;
const float ScoringParams::COST_FIRST_COMPLETION = 0.4836f;
const float ScoringParams::COST_COMPLETION = 0.00624f;
const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.0683f;
const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f;
-const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f;
+const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.3482f;
const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f;
const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f;
const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.095f;
+const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION = 0.99f;
+const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION = 0.99f;
+const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE = 0.99f;
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h
index b12de6d87..6f327a370 100644
--- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h
+++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h
@@ -30,9 +30,11 @@ class ScoringParams {
static const float AUTOCORRECT_OUTPUT_THRESHOLD;
static const int MAX_CACHE_DIC_NODE_SIZE;
static const int MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT;
+ static const int MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE;
static const int THRESHOLD_SHORT_WORD_LENGTH;
static const float EXACT_MATCH_PROMOTION;
+ static const float PERFECT_MATCH_PROMOTION;
static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH;
@@ -55,9 +57,9 @@ class ScoringParams {
static const float INSERTION_COST_FIRST_CHAR;
static const float TRANSPOSITION_COST;
static const float SPACE_SUBSTITUTION_COST;
+ static const float SPACE_OMISSION_COST;
static const float ADDITIONAL_PROXIMITY_COST;
static const float SUBSTITUTION_COST;
- static const float COST_NEW_WORD;
static const float COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE;
static const float DISTANCE_WEIGHT_LANGUAGE;
static const float COST_FIRST_COMPLETION;
@@ -68,6 +70,9 @@ class ScoringParams {
static const float TYPING_BASE_OUTPUT_SCORE;
static const float TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
static const float NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT;
+ static const float LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION;
+ static const float LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION;
+ static const float LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE;
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(ScoringParams);
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h
index 04cb6603a..6acd767ea 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h
@@ -33,32 +33,61 @@ class TypingScoring : public Scoring {
static const TypingScoring *getInstance() { return &sInstance; }
AK_FORCE_INLINE void getMostProbableString(const DicTraverseSession *const traverseSession,
- const float languageWeight, SuggestionResults *const outSuggestionResults) const {}
+ const float weightOfLangModelVsSpatialModel,
+ SuggestionResults *const outSuggestionResults) const {}
- AK_FORCE_INLINE float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession,
- DicNode *const terminals, const int size) const {
+ AK_FORCE_INLINE float getAdjustedWeightOfLangModelVsSpatialModel(
+ DicTraverseSession *const traverseSession, DicNode *const terminals,
+ const int size) const {
return 1.0f;
}
AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize,
const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit,
- const bool boostExactMatches) const {
+ const bool boostExactMatches, const bool hasProbabilityZero) const {
const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE
+ static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT;
float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance;
if (forceCommit) {
score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD;
}
- if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
- score += ScoringParams::EXACT_MATCH_PROMOTION;
- if ((ErrorTypeUtils::MATCH_WITH_CASE_ERROR & containedErrorTypes) != 0) {
- score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
+ if (hasProbabilityZero) {
+ // Previously, when both legitimate 0-frequency words (such as distracters) and
+ // offensive words were encoded in the same way, distracters would never show up
+ // when the user blocked offensive words (the default setting, as well as the
+ // setting for regression tests).
+ //
+ // When b/11031090 was fixed and a separate encoding was used for offensive words,
+ // 0-frequency words would no longer be blocked when they were an "exact match"
+ // (where case mismatches and accent mismatches would be considered an "exact
+ // match"). The exact match boosting functionality meant that, for example, when
+ // the user typed "mt" they would be suggested the word "Mt", although they most
+ // probably meant to type "my".
+ //
+ // For this reason, we introduced this change, which does the following:
+ // * Defines the "perfect match" as a really exact match, with no room for case or
+ // accent mismatches
+ // * When the target word has probability zero (as "Mt" does, because it is a
+ // distracter), ONLY boost its score if it is a perfect match.
+ //
+ // By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and
+ // they will get "my". However, if the user makes an explicit effort to type "Mt",
+ // we do boost the word "Mt" so that the user's input is not autocorrected to "My".
+ if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) {
+ score += ScoringParams::PERFECT_MATCH_PROMOTION;
}
- if ((ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR & containedErrorTypes) != 0) {
- score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
- }
- if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
- score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
+ } else {
+ if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) {
+ score += ScoringParams::EXACT_MATCH_PROMOTION;
+ if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) {
+ score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH;
+ }
+ if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) {
+ score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH;
+ }
+ if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) {
+ score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH;
+ }
}
}
return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE);
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
index cb3dfac70..b9b6314ae 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
@@ -26,6 +26,7 @@
#include "suggest/core/layout/proximity_info_utils.h"
#include "suggest/core/policy/traversal.h"
#include "suggest/core/session/dic_traverse_session.h"
+#include "suggest/core/suggest_options.h"
#include "suggest/policyimpl/typing/scoring_params.h"
#include "utils/char_utils.h"
@@ -77,6 +78,13 @@ class TypingTraversal : public Traversal {
if (!CORRECT_NEW_WORD_SPACE_SUBSTITUTION) {
return false;
}
+ if (traverseSession->getSuggestOptions()->weightForLocale()
+ < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION) {
+ // Space substitution is heavy, so we skip doing it if the weight for this language
+ // is low because we anticipate the suggestions out of this dictionary are not for
+ // the language the user intends to type in.
+ return false;
+ }
if (!canDoLookAheadCorrection(traverseSession, dicNode)) {
return false;
}
@@ -91,6 +99,13 @@ class TypingTraversal : public Traversal {
if (!CORRECT_NEW_WORD_SPACE_OMISSION) {
return false;
}
+ if (traverseSession->getSuggestOptions()->weightForLocale()
+ < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION) {
+ // Space omission is heavy, so we skip doing it if the weight for this language
+ // is low because we anticipate the suggestions out of this dictionary are not for
+ // the language the user intends to type in.
+ return false;
+ }
const int inputSize = traverseSession->getInputSize();
// TODO: Don't refer to isCompletion?
if (dicNode->isCompletion(inputSize)) {
@@ -141,9 +156,14 @@ class TypingTraversal : public Traversal {
return DicNodeVector::DEFAULT_NODES_SIZE_FOR_OPTIMIZATION;
}
- AK_FORCE_INLINE int getMaxCacheSize(const int inputSize) const {
- return (inputSize <= 1) ? ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT
- : ScoringParams::MAX_CACHE_DIC_NODE_SIZE;
+ AK_FORCE_INLINE int getMaxCacheSize(const int inputSize, const float weightForLocale) const {
+ if (inputSize <= 1) {
+ return ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT;
+ }
+ if (weightForLocale < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE) {
+ return ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE;
+ }
+ return ScoringParams::MAX_CACHE_DIC_NODE_SIZE;
}
AK_FORCE_INLINE int getTerminalCacheSize() const {
@@ -161,8 +181,8 @@ class TypingTraversal : public Traversal {
return true;
}
- AK_FORCE_INLINE bool isGoodToTraverseNextWord(const DicNode *const dicNode) const {
- const int probability = dicNode->getProbability();
+ AK_FORCE_INLINE bool isGoodToTraverseNextWord(const DicNode *const dicNode,
+ const int probability) const {
if (probability < ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY) {
return false;
}
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp
index 54f65c786..a0e54115d 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp
@@ -17,6 +17,7 @@
#include "suggest/policyimpl/typing/typing_weighting.h"
#include "suggest/core/dicnode/dic_node.h"
+#include "suggest/core/layout/proximity_info.h"
#include "suggest/policyimpl/typing/scoring_params.h"
namespace latinime {
@@ -36,30 +37,49 @@ ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType cor
// Compare the node code point with original primary code point on the keyboard.
const ProximityInfoState *const pInfoState =
traverseSession->getProximityInfoState(0);
- const int primaryOriginalCodePoint = pInfoState->getPrimaryOriginalCodePointAt(
+ const int primaryCodePoint = pInfoState->getPrimaryCodePointAt(
dicNode->getInputIndex(0));
const int nodeCodePoint = dicNode->getNodeCodePoint();
- if (primaryOriginalCodePoint == nodeCodePoint) {
+ const int keyIndex = traverseSession->getProximityInfo()->getKeyIndexOf(
+ primaryCodePoint);
+ // TODO: Check whether the input code point is on the keyboard.
+ if (primaryCodePoint == nodeCodePoint) {
// Node code point is same as original code point on the keyboard.
return ErrorTypeUtils::NOT_AN_ERROR;
- } else if (CharUtils::toLowerCase(primaryOriginalCodePoint) ==
+ } else if (CharUtils::toLowerCase(primaryCodePoint) ==
CharUtils::toLowerCase(nodeCodePoint)) {
// Only cases of the code points are different.
- return ErrorTypeUtils::MATCH_WITH_CASE_ERROR;
- } else if (CharUtils::toBaseCodePoint(primaryOriginalCodePoint) ==
- CharUtils::toBaseCodePoint(nodeCodePoint)) {
+ return ErrorTypeUtils::MATCH_WITH_WRONG_CASE;
+ } else if (primaryCodePoint == CharUtils::toBaseCodePoint(nodeCodePoint)) {
// Node code point is a variant of original code point.
- return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR;
- } else {
+ return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT;
+ } else if (CharUtils::toBaseCodePoint(primaryCodePoint)
+ == CharUtils::toBaseCodePoint(nodeCodePoint)) {
+ // Base code points are the same but the code point is intentionally input.
+ if (keyIndex == NOT_AN_INDEX) {
+ return ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT;
+ }
+ return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT;
+ } else if (CharUtils::toLowerCase(primaryCodePoint)
+ == CharUtils::toBaseLowerCase(nodeCodePoint)) {
// Node code point is a variant of original code point and the cases are also
// different.
- return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR
- | ErrorTypeUtils::MATCH_WITH_CASE_ERROR;
+ return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT
+ | ErrorTypeUtils::MATCH_WITH_WRONG_CASE;
+ } else {
+ if (keyIndex == NOT_AN_INDEX) {
+ return ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT
+ | ErrorTypeUtils::MATCH_WITH_WRONG_CASE;
+ }
+ // Base code points are the same and the cases are different.
+ return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT
+ | ErrorTypeUtils::MATCH_WITH_WRONG_CASE;
}
}
break;
case CT_ADDITIONAL_PROXIMITY:
- return ErrorTypeUtils::PROXIMITY_CORRECTION;
+ // TODO: Change to EDIT_CORRECTION.
+ return ErrorTypeUtils::PROXIMITY_CORRECTION;
case CT_OMISSION:
if (parentDicNode->canBeIntentionalOmission()) {
return ErrorTypeUtils::INTENTIONAL_OMISSION;
@@ -68,6 +88,8 @@ ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType cor
}
break;
case CT_SUBSTITUTION:
+ // TODO: Quit setting PROXIMITY_CORRECTION.
+ return ErrorTypeUtils::EDIT_CORRECTION | ErrorTypeUtils::PROXIMITY_CORRECTION;
case CT_INSERTION:
case CT_TERMINAL_INSERTION:
case CT_TRANSPOSITION:
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
index 84077174d..1338ac81a 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
@@ -150,9 +150,10 @@ class TypingWeighting : public Weighting {
return cost + weightedDistance;
}
- float getNewWordSpatialCost(const DicTraverseSession *const traverseSession,
+ float getSpaceOmissionCost(const DicTraverseSession *const traverseSession,
const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const {
- return ScoringParams::COST_NEW_WORD * traverseSession->getMultiWordCostMultiplier();
+ const float cost = ScoringParams::SPACE_OMISSION_COST;
+ return cost * traverseSession->getMultiWordCostMultiplier();
}
float getNewWordBigramLanguageCost(const DicTraverseSession *const traverseSession,
@@ -202,7 +203,10 @@ class TypingWeighting : public Weighting {
AK_FORCE_INLINE float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession,
const DicNode *const dicNode) const {
- const float cost = ScoringParams::SPACE_SUBSTITUTION_COST + ScoringParams::COST_NEW_WORD;
+ const int inputIndex = dicNode->getInputIndex(0);
+ const float distanceToSpaceKey = traverseSession->getProximityInfoState(0)
+ ->getPointToKeyLength(inputIndex, KEYCODE_SPACE);
+ const float cost = ScoringParams::SPACE_SUBSTITUTION_COST * distanceToSpaceKey;
return cost * traverseSession->getMultiWordCostMultiplier();
}
diff --git a/native/jni/src/utils/byte_array_view.h b/native/jni/src/utils/byte_array_view.h
index 2c97c6d58..2b778af6f 100644
--- a/native/jni/src/utils/byte_array_view.h
+++ b/native/jni/src/utils/byte_array_view.h
@@ -42,6 +42,13 @@ class ReadOnlyByteArrayView {
return mPtr;
}
+ AK_FORCE_INLINE const ReadOnlyByteArrayView skip(const size_t n) const {
+ if (mSize <= n) {
+ return ReadOnlyByteArrayView();
+ }
+ return ReadOnlyByteArrayView(mPtr + n, mSize - n);
+ }
+
private:
DISALLOW_ASSIGNMENT_OPERATOR(ReadOnlyByteArrayView);
@@ -77,10 +84,12 @@ class ReadWriteByteArrayView {
}
private:
- DISALLOW_ASSIGNMENT_OPERATOR(ReadWriteByteArrayView);
+ // Default copy constructor and assignment operator are used for using this class with STL
+ // containers.
- uint8_t *const mPtr;
- const size_t mSize;
+ // These members cannot be const to have the assignment operator.
+ uint8_t *mPtr;
+ size_t mSize;
};
} // namespace latinime
diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp
index b17e0847d..a43e6dd62 100644
--- a/native/jni/src/utils/char_utils.cpp
+++ b/native/jni/src/utils/char_utils.cpp
@@ -1057,11 +1057,11 @@ static int compare_pair_capital(const void *a, const void *b) {
- static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital);
}
-/* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) {
+/* static */ int CharUtils::latin_tolower(const int c) {
struct LatinCapitalSmallPair *p =
static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP,
NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital));
- return p ? p->small : c;
+ return p ? static_cast<int>(p->small) : c;
}
/*
@@ -1117,7 +1117,9 @@ static int compare_pair_capital(const void *a, const void *b) {
// TODO: Check if it's really acceptable to consider ø a diacritical variant of o
/* U+0100 */ 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0043, 0x0063,
/* U+0108 */ 0x0043, 0x0063, 0x0043, 0x0063, 0x0043, 0x0063, 0x0044, 0x0064,
- /* U+0110 */ 0x0110, 0x0111, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065,
+ /* U+0110 */ 0x0044, 0x0064, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065,
+ // U+0110: Manually changed from 0110 to 0044
+ // U+0111: Manually changed from 0111 to 0064
/* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067,
/* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127,
/* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069,
@@ -1135,6 +1137,9 @@ static int compare_pair_capital(const void *a, const void *b) {
/* U+0170 */ 0x0055, 0x0075, 0x0055, 0x0075, 0x0057, 0x0077, 0x0059, 0x0079,
/* U+0178 */ 0x0059, 0x005A, 0x007A, 0x005A, 0x007A, 0x005A, 0x007A, 0x0073,
/* U+0180 */ 0x0180, 0x0181, 0x0182, 0x0183, 0x0184, 0x0185, 0x0186, 0x0187,
+ // TODO: A lot of letters are their own base code points, but for
+ // some (e.g. U+0180) it doesn't seem right. Ideally each code point should
+ // be checked individually with all languages it's used in.
/* U+0188 */ 0x0188, 0x0189, 0x018A, 0x018B, 0x018C, 0x018D, 0x018E, 0x018F,
/* U+0190 */ 0x0190, 0x0191, 0x0192, 0x0193, 0x0194, 0x0195, 0x0196, 0x0197,
/* U+0198 */ 0x0198, 0x0199, 0x019A, 0x019B, 0x019C, 0x019D, 0x019E, 0x019F,
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
index 63786502b..7871c26ef 100644
--- a/native/jni/src/utils/char_utils.h
+++ b/native/jni/src/utils/char_utils.h
@@ -27,20 +27,14 @@ namespace latinime {
class CharUtils {
public:
+ static const std::vector<int> EMPTY_STRING;
+
static AK_FORCE_INLINE bool isAsciiUpper(int c) {
// Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
// be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
return (c >= 'A' && c <= 'Z');
}
- static AK_FORCE_INLINE int toAsciiLower(int c) {
- return c - 'A' + 'a';
- }
-
- static AK_FORCE_INLINE bool isAscii(int c) {
- return isascii(c) != 0;
- }
-
static AK_FORCE_INLINE int toLowerCase(const int c) {
if (isAsciiUpper(c)) {
return toAsciiLower(c);
@@ -48,7 +42,7 @@ class CharUtils {
if (isAscii(c)) {
return c;
}
- return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
+ return latin_tolower(c);
}
static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
@@ -59,7 +53,6 @@ class CharUtils {
// TODO: Do not hardcode here
return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
}
-
static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
int size = 0;
for (; size < arraySize; ++size) {
@@ -91,9 +84,6 @@ class CharUtils {
return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
}
- static unsigned short latin_tolower(const unsigned short c);
- static const std::vector<int> EMPTY_STRING;
-
// Returns updated code point count. Returns 0 when the code points cannot be marked as a
// Beginning-of-Sentence.
static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
@@ -111,6 +101,17 @@ class CharUtils {
return codePointCount + 1;
}
+ // Returns updated code point count.
+ static AK_FORCE_INLINE int removeBeginningOfSentenceMarker(int *const codePoints,
+ const int codePointCount) {
+ if (codePointCount <= 0 || codePoints[0] != CODE_POINT_BEGINNING_OF_SENTENCE) {
+ return codePointCount;
+ }
+ const int newCodePointCount = codePointCount - 1;
+ memmove(codePoints, codePoints + 1, sizeof(int) * newCodePointCount);
+ return newCodePointCount;
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
@@ -125,6 +126,16 @@ class CharUtils {
*/
static const int BASE_CHARS_SIZE = 0x0500;
static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
+
+ static AK_FORCE_INLINE bool isAscii(int c) {
+ return isascii(c) != 0;
+ }
+
+ static AK_FORCE_INLINE int toAsciiLower(int c) {
+ return c - 'A' + 'a';
+ }
+
+ static int latin_tolower(const int c);
};
} // namespace latinime
#endif // LATINIME_CHAR_UTILS_H
diff --git a/native/jni/src/utils/int_array_view.h b/native/jni/src/utils/int_array_view.h
index c1ddc9812..e0f671056 100644
--- a/native/jni/src/utils/int_array_view.h
+++ b/native/jni/src/utils/int_array_view.h
@@ -17,8 +17,10 @@
#ifndef LATINIME_INT_ARRAY_VIEW_H
#define LATINIME_INT_ARRAY_VIEW_H
+#include <algorithm>
+#include <array>
#include <cstdint>
-#include <cstdlib>
+#include <cstring>
#include <vector>
#include "defines.h"
@@ -56,14 +58,14 @@ class IntArrayView {
explicit IntArrayView(const std::vector<int> &vector)
: mPtr(vector.data()), mSize(vector.size()) {}
- template <int N>
- AK_FORCE_INLINE static IntArrayView fromFixedSizeArray(const int (&array)[N]) {
- return IntArrayView(array, N);
+ template <size_t N>
+ AK_FORCE_INLINE static IntArrayView fromArray(const std::array<int, N> &array) {
+ return IntArrayView(array.data(), array.size());
}
- // Returns a view that points one int object. Does not take ownership of the given object.
- AK_FORCE_INLINE static IntArrayView fromObject(const int *const object) {
- return IntArrayView(object, 1);
+ // Returns a view that points one int object.
+ AK_FORCE_INLINE static IntArrayView singleElementView(const int *const ptr) {
+ return IntArrayView(ptr, 1);
}
AK_FORCE_INLINE int operator[](const size_t index) const {
@@ -91,6 +93,69 @@ class IntArrayView {
return mPtr + mSize;
}
+ AK_FORCE_INLINE bool contains(const int value) const {
+ return std::find(begin(), end(), value) != end();
+ }
+
+ // Returns the view whose size is smaller than or equal to the given count.
+ AK_FORCE_INLINE const IntArrayView limit(const size_t maxSize) const {
+ return IntArrayView(mPtr, std::min(maxSize, mSize));
+ }
+
+ AK_FORCE_INLINE const IntArrayView skip(const size_t n) const {
+ if (mSize <= n) {
+ return IntArrayView();
+ }
+ return IntArrayView(mPtr + n, mSize - n);
+ }
+
+ template <size_t N>
+ void copyToArray(std::array<int, N> *const buffer, const size_t offset) const {
+ ASSERT(mSize + offset <= N);
+ memmove(buffer->data() + offset, mPtr, sizeof(int) * mSize);
+ }
+
+ AK_FORCE_INLINE int firstOrDefault(const int defaultValue) const {
+ if (empty()) {
+ return defaultValue;
+ }
+ return mPtr[0];
+ }
+
+ AK_FORCE_INLINE int lastOrDefault(const int defaultValue) const {
+ if (empty()) {
+ return defaultValue;
+ }
+ return mPtr[mSize - 1];
+ }
+
+ AK_FORCE_INLINE std::vector<int> toVector() const {
+ return std::vector<int>(begin(), end());
+ }
+
+ std::vector<IntArrayView> split(const int separator, const int limit = S_INT_MAX) const {
+ if (limit <= 0) {
+ return std::vector<IntArrayView>();
+ }
+ std::vector<IntArrayView> result;
+ if (limit == 1) {
+ result.emplace_back(mPtr, mSize);
+ return result;
+ }
+ size_t startIndex = 0;
+ for (size_t i = 0; i < mSize; ++i) {
+ if (mPtr[i] == separator) {
+ result.emplace_back(mPtr + startIndex, i - startIndex);
+ startIndex = i + 1;
+ if (result.size() >= static_cast<size_t>(limit - 1)) {
+ break;
+ }
+ }
+ }
+ result.emplace_back(mPtr + startIndex, mSize - startIndex);
+ return result;
+ }
+
private:
DISALLOW_ASSIGNMENT_OPERATOR(IntArrayView);
@@ -100,6 +165,9 @@ class IntArrayView {
using WordIdArrayView = IntArrayView;
using PtNodePosArrayView = IntArrayView;
+using CodePointArrayView = IntArrayView;
+template <size_t size>
+using WordIdArray = std::array<int, size>;
} // namespace latinime
#endif // LATINIME_MEMORY_VIEW_H
diff --git a/native/jni/src/utils/jni_data_utils.cpp b/native/jni/src/utils/jni_data_utils.cpp
index 5555293d5..41f0623d8 100644
--- a/native/jni/src/utils/jni_data_utils.cpp
+++ b/native/jni/src/utils/jni_data_utils.cpp
@@ -16,9 +16,100 @@
#include "utils/jni_data_utils.h"
+#include "utils/int_array_view.h"
+
namespace latinime {
const int JniDataUtils::CODE_POINT_REPLACEMENT_CHARACTER = 0xFFFD;
const int JniDataUtils::CODE_POINT_NULL = 0;
+// Copies the contents of wordProperty into the Java-side output objects.
+//
+// outCodePoints receives the word's code points (at most MAX_WORD_LENGTH, without null
+// termination).
+// outFlags receives, in this order: isNotAWord, isPossiblyOffensive, "has n-grams",
+// hasShortcuts, representsBeginningOfSentence.
+// outProbabilityInfo receives, in this order: probability, timestamp, level, count.
+// The remaining jobject parameters are appended to through the "add(Object)" method looked up
+// on java.util.ArrayList: the four outNgram* objects get one element per n-gram and the two
+// outShortcut* objects get one element per shortcut.
+//
+// All local references created in this function are released before it returns.
+/* static */ void JniDataUtils::outputWordProperty(JNIEnv *const env,
+        const WordProperty &wordProperty, jintArray outCodePoints, jbooleanArray outFlags,
+        jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
+        jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
+        jobject outNgramProbabilities, jobject outShortcutTargets,
+        jobject outShortcutProbabilities) {
+    const CodePointArrayView codePoints = wordProperty.getCodePoints();
+    JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
+            MAX_WORD_LENGTH /* maxLength */, codePoints.data(), codePoints.size(),
+            false /* needsNullTermination */);
+    const UnigramProperty &unigramProperty = wordProperty.getUnigramProperty();
+    const std::vector<NgramProperty> &ngrams = wordProperty.getNgramProperties();
+    jboolean flags[] = {unigramProperty.isNotAWord(), unigramProperty.isPossiblyOffensive(),
+            !ngrams.empty(), unigramProperty.hasShortcuts(),
+            unigramProperty.representsBeginningOfSentence()};
+    env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags);
+    const HistoricalInfo &historicalInfo = unigramProperty.getHistoricalInfo();
+    int probabilityInfo[] = {unigramProperty.getProbability(), historicalInfo.getTimestamp(),
+            historicalInfo.getLevel(), historicalInfo.getCount()};
+    env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo),
+            probabilityInfo);
+
+    jclass integerClass = env->FindClass("java/lang/Integer");
+    jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V");
+    jclass arrayListClass = env->FindClass("java/util/ArrayList");
+    jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
+
+    // Output ngrams.
+    jclass intArrayClass = env->FindClass("[I");
+    for (const auto &ngramProperty : ngrams) {
+        // Previous words: an int[][] of code points plus a parallel boolean[] of
+        // beginning-of-sentence flags. getNthPrev*() is called with 1-origin positions.
+        const NgramContext *const ngramContext = ngramProperty.getNgramContext();
+        jobjectArray prevWordWordCodePointsArray = env->NewObjectArray(
+                ngramContext->getPrevWordCount(), intArrayClass, nullptr);
+        jbooleanArray prevWordIsBeginningOfSentenceArray =
+                env->NewBooleanArray(ngramContext->getPrevWordCount());
+        for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) {
+            const CodePointArrayView prevWordCodePointView =
+                    ngramContext->getNthPrevWordCodePoints(i + 1);
+            jintArray prevWordCodePoints = env->NewIntArray(prevWordCodePointView.size());
+            JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */,
+                    prevWordCodePointView.size(), prevWordCodePointView.data(),
+                    prevWordCodePointView.size(), false /* needsNullTermination */);
+            env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints);
+            env->DeleteLocalRef(prevWordCodePoints);
+            JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i,
+                    ngramContext->isNthPrevWordBeginningOfSentence(i + 1));
+        }
+        env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray);
+        env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId,
+                prevWordIsBeginningOfSentenceArray);
+        env->DeleteLocalRef(prevWordWordCodePointsArray);
+        env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray);
+
+        // Target word of the n-gram.
+        const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints();
+        jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size());
+        JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */,
+                targetWordCodePoints->size(), targetWordCodePoints->data(),
+                targetWordCodePoints->size(), false /* needsNullTermination */);
+        env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray);
+        env->DeleteLocalRef(targetWordCodePointArray);
+
+        // Probability information, laid out like the unigram's: probability, timestamp, level,
+        // count.
+        const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo();
+        int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
+                ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(),
+                ngramHistoricalInfo.getCount()};
+        jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
+        env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
+                NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
+        env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray);
+        env->DeleteLocalRef(bigramProbabilityInfoArray);
+    }
+    // Release the class reference like the other class references obtained above.
+    env->DeleteLocalRef(intArrayClass);
+
+    // Output shortcuts.
+    for (const auto &shortcut : unigramProperty.getShortcuts()) {
+        const std::vector<int> *const targetCodePoints = shortcut.getTargetCodePoints();
+        jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size());
+        JniDataUtils::outputCodePoints(env, shortcutTargetCodePointArray, 0 /* start */,
+                targetCodePoints->size(), targetCodePoints->data(), targetCodePoints->size(),
+                false /* needsNullTermination */);
+        env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray);
+        env->DeleteLocalRef(shortcutTargetCodePointArray);
+        // The probability is boxed into a java.lang.Integer before being added to the list.
+        jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId,
+                shortcut.getProbability());
+        env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability);
+        env->DeleteLocalRef(integerProbability);
+    }
+    env->DeleteLocalRef(integerClass);
+    env->DeleteLocalRef(arrayListClass);
+}
+
} // namespace latinime
diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h
index cb82d3c3b..8024e34c4 100644
--- a/native/jni/src/utils/jni_data_utils.h
+++ b/native/jni/src/utils/jni_data_utils.h
@@ -20,10 +20,11 @@
#include <vector>
#include "defines.h"
+#include "dictionary/header/header_read_write_utils.h"
+#include "dictionary/interface/dictionary_header_structure_policy.h"
+#include "dictionary/property/ngram_context.h"
+#include "dictionary/property/word_property.h"
#include "jni.h"
-#include "suggest/core/session/prev_words_info.h"
-#include "suggest/core/policy/dictionary_header_structure_policy.h"
-#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
#include "utils/char_utils.h"
namespace latinime {
@@ -50,6 +51,7 @@ class JniDataUtils {
const jsize keyUtf8Length = env->GetStringUTFLength(keyString);
char keyChars[keyUtf8Length + 1];
env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars);
+ env->DeleteLocalRef(keyString);
keyChars[keyUtf8Length] = '\0';
DictionaryHeaderStructurePolicy::AttributeMap::key_type key;
HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key);
@@ -59,6 +61,7 @@ class JniDataUtils {
const jsize valueUtf8Length = env->GetStringUTFLength(valueString);
char valueChars[valueUtf8Length + 1];
env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars);
+ env->DeleteLocalRef(valueString);
valueChars[valueUtf8Length] = '\0';
DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value;
HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value);
@@ -96,18 +99,14 @@ class JniDataUtils {
}
}
- static PrevWordsInfo constructPrevWordsInfo(JNIEnv *env, jobjectArray prevWordCodePointArrays,
- jbooleanArray isBeginningOfSentenceArray) {
+ static NgramContext constructNgramContext(JNIEnv *env, jobjectArray prevWordCodePointArrays,
+ jbooleanArray isBeginningOfSentenceArray, const size_t prevWordCount) {
int prevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
int prevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
bool isBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
- jsize prevWordsCount = env->GetArrayLength(prevWordCodePointArrays);
- for (size_t i = 0; i < NELEMS(prevWordCodePoints); ++i) {
+ for (size_t i = 0; i < prevWordCount; ++i) {
prevWordCodePointCount[i] = 0;
isBeginningOfSentence[i] = false;
- if (prevWordsCount <= static_cast<int>(i)) {
- continue;
- }
jintArray prevWord = (jintArray)env->GetObjectArrayElement(prevWordCodePointArrays, i);
if (!prevWord) {
continue;
@@ -117,14 +116,15 @@ class JniDataUtils {
continue;
}
env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]);
+ env->DeleteLocalRef(prevWord);
prevWordCodePointCount[i] = prevWordLength;
jboolean isBeginningOfSentenceBoolean = JNI_FALSE;
env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */,
&isBeginningOfSentenceBoolean);
isBeginningOfSentence[i] = isBeginningOfSentenceBoolean == JNI_TRUE;
}
- return PrevWordsInfo(prevWordCodePoints, prevWordCodePointCount, isBeginningOfSentence,
- MAX_PREV_WORD_COUNT_FOR_N_GRAM);
+ return NgramContext(prevWordCodePoints, prevWordCodePointCount, isBeginningOfSentence,
+ prevWordCount);
}
static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index,
@@ -141,6 +141,12 @@ class JniDataUtils {
env->SetFloatArrayRegion(array, index, 1 /* len */, &value);
}
+ // Writes the contents of wordProperty into the given Java arrays and list objects: the word's
+ // code points, its flags and probability info, and one entry per n-gram / shortcut in the
+ // remaining jobject parameters. Defined in jni_data_utils.cpp.
+ static void outputWordProperty(JNIEnv *const env, const WordProperty &wordProperty,
+ jintArray outCodePoints, jbooleanArray outFlags, jintArray outProbabilityInfo,
+ jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray,
+ jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets,
+ jobject outShortcutProbabilities);
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(JniDataUtils);
diff --git a/native/jni/src/utils/ngram_utils.h b/native/jni/src/utils/ngram_utils.h
new file mode 100644
index 000000000..fa85ba35f
--- /dev/null
+++ b/native/jni/src/utils/ngram_utils.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_UTILS_H
+#define LATINIME_NGRAM_UTILS_H
+
+#include "defines.h"
+
+namespace latinime {
+
+// Kind of n-gram. The valid kinds are numbered from 0 in order of increasing word count
+// (Unigram = 0 ... Quadgram = 3); NotANgramType (-1) is the value for "no valid type".
+enum class NgramType : int {
+ Unigram = 0,
+ Bigram = 1,
+ Trigram = 2,
+ Quadgram = 3,
+ NotANgramType = -1,
+};
+
+// Constant lists of n-gram types for iteration in ascending or descending order.
+// NOTE(review): both lists stop at Trigram although NgramType also declares Quadgram --
+// confirm whether that is intentional.
+namespace AllNgramTypes {
+// Use anonymous namespace to avoid ODR (One Definition Rule) violation.
+namespace {
+
+// Unigram first, Trigram last.
+const NgramType ASCENDING[] = {
+ NgramType::Unigram, NgramType::Bigram, NgramType::Trigram
+};
+
+// Trigram first, Unigram last.
+const NgramType DESCENDING[] = {
+ NgramType::Trigram, NgramType::Bigram, NgramType::Unigram
+};
+
+} // namespace
+} // namespace AllNgramTypes
+
+// Static helpers for working with NgramType values.
+class NgramUtils final {
+ public:
+    // Maps the number of words in an n-gram to its NgramType. Word counts from 1 up to
+    // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1) are supported; any other count yields
+    // NgramType::NotANgramType.
+    static AK_FORCE_INLINE NgramType getNgramTypeFromWordCount(const int wordCount) {
+        const bool isSupportedWordCount =
+                wordCount >= 1 && wordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1;
+        // The enum values are 0-origin, hence the "- 1".
+        return isSupportedWordCount
+                ? static_cast<NgramType>(wordCount - 1) : NgramType::NotANgramType;
+    }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(NgramUtils);
+};
+}
+#endif /* LATINIME_NGRAM_UTILS_H */
diff --git a/native/jni/src/utils/profiler.h b/native/jni/src/utils/profiler.h
new file mode 100644
index 000000000..5f107fed3
--- /dev/null
+++ b/native/jni/src/utils/profiler.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_PROFILER_H
+#define LATINIME_PROFILER_H
+
+#ifdef FLAG_DO_PROFILE
+
+#include "defines.h"
+
+#include <ctime>
+#include <unordered_map>
+
+namespace latinime {
+
+// Accumulates elapsed time and call counts per integer timer id, measured with the clock given
+// at construction, and logs a summary through AKLOGI when the object is destroyed.
+class Profiler final {
+ public:
+    // Starts measuring the total time at construction, reading the clock identified by clockId.
+    explicit Profiler(const clockid_t clockId)
+            : mClockId(clockId), mStartTime(getTimeInMicroSec()), mStartTimes(), mTimes(),
+              mCounters() {}
+
+    // Logs the total time since construction and, for every timer id that was ended at least
+    // once, its share of the total time, its accumulated time and its call count.
+    ~Profiler() {
+        const float totalTime =
+                static_cast<float>(getTimeInMicroSec() - mStartTime) / 1000.f;
+        AKLOGI("Total time is %6.3f ms.", totalTime);
+        for (const auto &time : mTimes) {
+            // Report 0% instead of dividing by zero when no measurable time has elapsed.
+            const float usedPercentage =
+                    (totalTime > 0.0f) ? time.second / totalTime * 100.0f : 0.0f;
+            AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", time.first,
+                    usedPercentage, time.second, mCounters[time.first]);
+        }
+    }
+
+    // Records the current time as the start of timer id, replacing any earlier start.
+    void startTimer(const int id) {
+        mStartTimes[id] = getTimeInMicroSec();
+    }
+
+    // Adds the time elapsed since the last startTimer(id), in milliseconds, to the total of
+    // timer id and increments its call count.
+    void endTimer(const int id) {
+        mTimes[id] += static_cast<float>(getTimeInMicroSec() - mStartTimes[id]) / 1000.0f;
+        mCounters[id]++;
+    }
+
+    // Always converts to false.
+    operator bool() const { return false; }
+
+ private:
+    DISALLOW_IMPLICIT_CONSTRUCTORS(Profiler);
+
+    const clockid_t mClockId;
+    // Time of construction, in microseconds.
+    int64_t mStartTime;
+    // Timer id -> time of the most recent startTimer() call, in microseconds.
+    std::unordered_map<int, int64_t> mStartTimes;
+    // Timer id -> accumulated time, in milliseconds.
+    std::unordered_map<int, float> mTimes;
+    // Timer id -> number of endTimer() calls.
+    std::unordered_map<int, int> mCounters;
+
+    // Reads the clock mClockId and returns its value in microseconds.
+    int64_t getTimeInMicroSec() const {
+        // Zero-initialized so the result is well defined even if clock_gettime() fails.
+        timespec time = {};
+        clock_gettime(mClockId, &time);
+        return static_cast<int64_t>(time.tv_sec) * 1000000
+                + static_cast<int64_t>(time.tv_nsec) / 1000;
+    }
+};
+} // namespace latinime
+
+// Profiling hooks, compiled in only when FLAG_DO_PROFILE is defined.
+// PROF_INIT declares a Profiler variable named __LATINIME__PROFILER__ that reads
+// CLOCK_THREAD_CPUTIME_ID. PROF_TIMER_START / PROF_TIMER_END call startTimer() / endTimer() on
+// that variable, so they can only be used where a PROF_INIT declaration is in scope. Profiler
+// is named without qualification, so it must be resolvable at the point of use.
+#define PROF_INIT Profiler __LATINIME__PROFILER__(CLOCK_THREAD_CPUTIME_ID)
+#define PROF_TIMER_START(timer_id) __LATINIME__PROFILER__.startTimer(timer_id)
+#define PROF_TIMER_END(timer_id) __LATINIME__PROFILER__.endTimer(timer_id)
+
+#else // FLAG_DO_PROFILE
+
+// FLAG_DO_PROFILE is not defined: the profiling macros expand to nothing.
+#define PROF_INIT
+#define PROF_TIMER_START(timer_id)
+#define PROF_TIMER_END(timer_id)
+
+#endif // FLAG_DO_PROFILE
+
+#endif /* LATINIME_PROFILER_H */