diff options
Diffstat (limited to 'native/jni/src')
-rw-r--r-- | native/jni/src/defines.h | 74 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp) | 73 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h) | 127 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_read_write_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp) | 41 | ||||
-rw-r--r-- | native/jni/src/dictionary/header/header_read_write_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h) | 11 | ||||
-rw-r--r-- | native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/interface/dictionary_header_structure_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h (renamed from native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h) | 55 | ||||
-rw-r--r-- | native/jni/src/dictionary/interface/ngram_listener.h (renamed from native/jni/src/suggest/core/dictionary/ngram_listener.h) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/historical_info.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h) | 5 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/ngram_context.cpp | 123 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/ngram_context.h | 78 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/ngram_property.h | 62 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/unigram_property.h | 137 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/word_attributes.h | 68 | ||||
-rw-r--r-- | native/jni/src/dictionary/property/word_property.h (renamed from native/jni/src/suggest/core/dictionary/property/word_property.h) | 32 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/Readme.txt (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp) | 35 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp) | 12 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp) | 21 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h) | 12 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h) | 14 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h) | 18 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp) | 25 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp) | 64 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h) | 17 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp) | 421 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h) | 88 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp) | 47 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp) | 52 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp) | 35 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h) | 7 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp) | 17 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h) | 15 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp) | 140 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h) | 47 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp) | 33 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h) | 25 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/pt_node_params.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h) | 29 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/pt_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/pt_node_writer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h) | 6 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp) | 20 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h) | 13 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h) | 19 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp) | 264 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/patricia_trie_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h) | 91 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h) | 18 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp) | 16 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h) | 20 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp) | 18 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h) | 9 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp | 34 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h | 77 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp | 478 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h | 258 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp | 32 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h | 101 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/probability_entry.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h) | 93 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h) | 14 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/single_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h) | 13 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h) | 21 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp) | 7 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp) | 46 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h) | 28 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp) | 36 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_dict_constants.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h) | 20 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp) | 36 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h) | 15 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp) | 135 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h) | 33 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp | 603 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h) | 87 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp | 185 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h) | 59 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h (renamed from native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h (renamed from native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h (renamed from native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h) | 10 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/bloom_filter.h (renamed from native/jni/src/suggest/core/dictionary/bloom_filter.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp) | 8 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/byte_array_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/byte_array_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h) | 27 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/dict_file_writing_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp) | 24 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/dict_file_writing_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/entry_counters.h | 89 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/file_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/file_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/forgetting_curve_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp) | 91 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/forgetting_curve_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h) | 27 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/format_utils.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp) | 28 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/format_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h) | 14 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/mmapped_buffer.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp) | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/mmapped_buffer.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h) | 0 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/multi_bigram_map.cpp (renamed from native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp) | 55 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/multi_bigram_map.h (renamed from native/jni/src/suggest/core/dictionary/multi_bigram_map.h) | 26 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/probability_utils.cpp | 23 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/probability_utils.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h) | 15 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/sparse_table.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp) | 2 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/sparse_table.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h) | 3 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/trie_map.cpp (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp) | 87 | ||||
-rw-r--r-- | native/jni/src/dictionary/utils/trie_map.h (renamed from native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h) | 19 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dicnode/dic_node.h | 60 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dicnode/dic_node_utils.cpp | 31 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dicnode/dic_node_utils.h | 7 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dicnode/dic_node_vector.h | 11 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h | 84 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h | 6 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/dictionary.cpp | 123 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/dictionary.h | 53 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/dictionary_utils.cpp | 32 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/dictionary_utils.h | 3 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/digraph_utils.cpp | 2 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/error_type_utils.cpp | 21 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/error_type_utils.h | 15 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/property/bigram_property.h | 66 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/property/unigram_property.h | 114 | ||||
-rw-r--r-- | native/jni/src/suggest/core/dictionary/property/word_property.cpp | 84 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/additional_proximity_chars.cpp | 2 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/additional_proximity_chars.h | 27 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/geometry_utils.h | 14 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/proximity_info.cpp | 21 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/proximity_info.h | 14 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/proximity_info_state.cpp | 4 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/proximity_info_state.h | 3 | ||||
-rw-r--r-- | native/jni/src/suggest/core/layout/proximity_info_utils.h | 11 | ||||
-rw-r--r-- | native/jni/src/suggest/core/policy/scoring.h | 10 | ||||
-rw-r--r-- | native/jni/src/suggest/core/policy/traversal.h | 5 | ||||
-rw-r--r-- | native/jni/src/suggest/core/policy/weighting.cpp | 14 | ||||
-rw-r--r-- | native/jni/src/suggest/core/policy/weighting.h | 2 | ||||
-rw-r--r-- | native/jni/src/suggest/core/result/suggestion_results.cpp | 7 | ||||
-rw-r--r-- | native/jni/src/suggest/core/result/suggestion_results.h | 12 | ||||
-rw-r--r-- | native/jni/src/suggest/core/result/suggestions_output_utils.cpp | 125 | ||||
-rw-r--r-- | native/jni/src/suggest/core/result/suggestions_output_utils.h | 13 | ||||
-rw-r--r-- | native/jni/src/suggest/core/session/dic_traverse_session.cpp | 20 | ||||
-rw-r--r-- | native/jni/src/suggest/core/session/dic_traverse_session.h | 23 | ||||
-rw-r--r-- | native/jni/src/suggest/core/session/prev_words_info.h | 162 | ||||
-rw-r--r-- | native/jni/src/suggest/core/suggest.cpp | 47 | ||||
-rw-r--r-- | native/jni/src/suggest/core/suggest.h | 3 | ||||
-rw-r--r-- | native/jni/src/suggest/core/suggest_interface.h | 3 | ||||
-rw-r--r-- | native/jni/src/suggest/core/suggest_options.h | 9 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp | 282 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h | 72 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp | 219 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h | 128 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h | 99 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp | 95 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h | 83 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp | 551 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp | 294 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/typing/scoring_params.cpp | 13 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/typing/scoring_params.h | 7 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/typing/typing_scoring.h | 55 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/typing/typing_traversal.h | 30 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp | 44 | ||||
-rw-r--r-- | native/jni/src/suggest/policyimpl/typing/typing_weighting.h | 10 | ||||
-rw-r--r-- | native/jni/src/utils/byte_array_view.h | 15 | ||||
-rw-r--r-- | native/jni/src/utils/char_utils.cpp | 11 | ||||
-rw-r--r-- | native/jni/src/utils/char_utils.h | 37 | ||||
-rw-r--r-- | native/jni/src/utils/int_array_view.h | 82 | ||||
-rw-r--r-- | native/jni/src/utils/jni_data_utils.cpp | 91 | ||||
-rw-r--r-- | native/jni/src/utils/jni_data_utils.h | 30 | ||||
-rw-r--r-- | native/jni/src/utils/ngram_utils.h | 63 | ||||
-rw-r--r-- | native/jni/src/utils/profiler.h | 86 |
200 files changed, 5093 insertions, 4318 deletions
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h index 24d04e51f..10b930e4f 100644 --- a/native/jni/src/defines.h +++ b/native/jni/src/defines.h @@ -23,10 +23,10 @@ #define AK_FORCE_INLINE inline #endif // __GNUC__ -#if defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) +#if defined(FLAG_DBG) #undef AK_FORCE_INLINE #define AK_FORCE_INLINE inline -#endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) +#endif // defined(FLAG_DBG) // Must be equal to Constants.Dictionary.MAX_WORD_LENGTH in Java #define MAX_WORD_LENGTH 48 @@ -119,7 +119,7 @@ static inline void dumpWordInfo(const int *word, const int length, const int ran const int probability) { static char charBuf[50]; const int N = intArrayToCharArray(word, length, charBuf, NELEMS(charBuf)); - if (N > 1) { + if (N > 0) { AKLOGI("%2d [ %s ] (%d)", rank, charBuf, probability); } } @@ -172,69 +172,6 @@ static inline void showStackTrace() { #define INTS_TO_CHARS(input, length, output) #endif // defined(FLAG_DO_PROFILE) || defined(FLAG_DBG) -#ifdef FLAG_DO_PROFILE -// Profiler -#include <time.h> - -#define PROF_BUF_SIZE 100 -static float profile_buf[PROF_BUF_SIZE]; -static float profile_old[PROF_BUF_SIZE]; -static unsigned int profile_counter[PROF_BUF_SIZE]; - -#define PROF_RESET prof_reset() -#define PROF_COUNT(prof_buf_id) ++profile_counter[prof_buf_id] -#define PROF_OPEN do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while (0) -#define PROF_START(prof_buf_id) do { \ - PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while (0) -#define PROF_CLOSE do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while (0) -#define PROF_END(prof_buf_id) profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id]) -#define PROF_CLOCKOUT(prof_buf_id) \ - AKLOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id])) -#define PROF_OUTALL do { AKLOGI("--- %s ---", __FUNCTION__); prof_out(); } while (0) - -static inline void prof_reset(void) { - for (int i = 0; i < PROF_BUF_SIZE; ++i) { - 
profile_buf[i] = 0; - profile_old[i] = 0; - profile_counter[i] = 0; - } -} - -static inline void prof_out(void) { - if (profile_counter[PROF_BUF_SIZE - 1] != 1) { - AKLOGI("Error: You must call PROF_OPEN before PROF_CLOSE."); - } - AKLOGI("Total time is %6.3f ms.", - profile_buf[PROF_BUF_SIZE - 1] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC)); - float all = 0.0f; - for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { - all += profile_buf[i]; - } - if (all < 1.0f) all = 1.0f; - for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) { - if (profile_buf[i] > 0.0f) { - AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", - i, (profile_buf[i] * 100.0f / all), - profile_buf[i] * 1000.0f / static_cast<float>(CLOCKS_PER_SEC), - profile_counter[i]); - } - } -} - -#else // FLAG_DO_PROFILE -#define PROF_BUF_SIZE 0 -#define PROF_RESET -#define PROF_COUNT(prof_buf_id) -#define PROF_OPEN -#define PROF_START(prof_buf_id) -#define PROF_CLOSE -#define PROF_END(prof_buf_id) -#define PROF_CLOCK_OUT(prof_buf_id) -#define PROF_CLOCKOUT(prof_buf_id) -#define PROF_OUTALL - -#endif // FLAG_DO_PROFILE - #ifdef FLAG_DBG #define DEBUG_DICT true #define DEBUG_DICT_FULL false @@ -299,8 +236,9 @@ static inline void prof_out(void) { #define NOT_AN_INDEX (-1) #define NOT_A_PROBABILITY (-1) #define NOT_A_DICT_POS (S_INT_MIN) +#define NOT_A_WORD_ID (S_INT_MIN) #define NOT_A_TIMESTAMP (-1) -#define NOT_A_LANGUAGE_WEIGHT (-1.0f) +#define NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1.0f) // A special value to mean the first word confidence makes no sense in this case, // e.g. this is not a multi-word suggestion. @@ -337,7 +275,7 @@ static inline void prof_out(void) { #define MAX_POINTER_COUNT_G 2 // (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram is supported. 
-#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 1 +#define MAX_PREV_WORD_COUNT_FOR_N_GRAM 3 #define DISALLOW_DEFAULT_CONSTRUCTOR(TypeName) \ TypeName() = delete diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/dictionary/header/header_policy.cpp index 6ed65d921..d4f84d39f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/dictionary/header/header_policy.cpp @@ -14,10 +14,12 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "dictionary/header/header_policy.h" #include <algorithm> +#include "utils/ngram_utils.h" + namespace latinime { // Note that these are corresponding definitions in Java side in DictionaryHeader. @@ -28,33 +30,23 @@ const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; -const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; -const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; +const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] = + {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"}; +const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] = + {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT", + "MAX_QUADGRAM_ENTRY_COUNT"}; +const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000}; const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE"; // Historical info is information that is needed to support decaying such as timestamp, level and // count. 
const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO"; const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration -const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY = - "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP"; const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY = "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID"; -const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY = - "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS"; - -const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT"; -const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT"; const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; -const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 2; const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3; -// 30 days -const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS = - 30 * 24 * 60 * 60; - -const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000; -const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000; // Used for logging. Question mark is used to indicate that the key is not found. 
void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, @@ -100,12 +92,11 @@ bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { } bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, - const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const { + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const { int writingPos = 0; DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap); - fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount, - extendedRegionSize, &attributeMapToWrite); + fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite); if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, &writingPos)) { return false; @@ -132,11 +123,22 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTim return true; } -void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount, - const int bigramCount, const int extendedRegionSize, +namespace { + +int getIndexFromNgramType(const NgramType ngramType) { + return static_cast<int>(ngramType); +} + +} // namespace + +void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, + const EntryCounts &entryCounts, const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const { - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount); - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount); + for (const auto ngramType : AllNgramTypes::ASCENDING) { + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], + entryCounts.getNgramCount(ngramType)); + } HeaderReadWriteUtils::setIntAttribute(outAttributeMap, 
EXTENDED_REGION_SIZE_KEY, extendedRegionSize); // Set the current time as the generation time. @@ -157,4 +159,25 @@ void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int uni return attributeMap; } +/* static */ const EntryCounts HeaderPolicy::readNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */); + entryCounters.setNgramCount(ngramType, entryCount); + } + return entryCounters.getEntryCounts(); +} + +/* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const { + MutableEntryCounters entryCounters; + for (const auto ngramType : AllNgramTypes::ASCENDING) { + const int index = getIndexFromNgramType(ngramType); + const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]); + entryCounters.setNgramCount(ngramType, maxEntryCount); + } + return entryCounters.getEntryCounts(); +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/dictionary/header/header_policy.h index 87cf0cd3b..47cc9196a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/dictionary/header/header_policy.h @@ -20,9 +20,10 @@ #include <cstdint> #include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/format_utils.h" #include "utils/char_utils.h" #include 
"utils/time_keeper.h" @@ -45,27 +46,15 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), - mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - BIGRAM_COUNT_KEY, 0 /* defaultValue */)), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), - mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, - DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), - mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, - DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), - mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), - mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Constructs header information using an attribute map. 
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, @@ -82,22 +71,14 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), - mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), + mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()), + mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), - mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, - DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), - mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, - DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), - mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), - mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( - &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} + mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {} // Copy header information HeaderPolicy(const HeaderPolicy *const headerPolicy) @@ -108,27 +89,22 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), mIsDecayingDict(headerPolicy->mIsDecayingDict), mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), - 
mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount), + mNgramCounts(headerPolicy->mNgramCounts), + mMaxNgramCounts(headerPolicy->mMaxNgramCounts), mExtendedRegionSize(headerPolicy->mExtendedRegionSize), mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), - mForgettingCurveOccurrencesToLevelUp( - headerPolicy->mForgettingCurveOccurrencesToLevelUp), mForgettingCurveProbabilityValuesTableId( headerPolicy->mForgettingCurveProbabilityValuesTableId), - mForgettingCurveDurationToLevelDown( - headerPolicy->mForgettingCurveDurationToLevelDown), - mMaxUnigramCount(headerPolicy->mMaxUnigramCount), - mMaxBigramCount(headerPolicy->mMaxBigramCount) {} + mCodePointTable(headerPolicy->mCodePointTable) {} // Temporary dummy header. HeaderPolicy() : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), - mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), + mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), - mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0), - mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {} + mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {} ~HeaderPolicy() {} @@ -138,13 +114,17 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { // same so we use them for both here. 
switch (mDictFormatVersion) { case FormatUtils::VERSION_2: - return FormatUtils::VERSION_2; + case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return FormatUtils::UNKNOWN_VERSION; + case FormatUtils::VERSION_202: + return FormatUtils::VERSION_202; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: return FormatUtils::VERSION_4_ONLY_FOR_TESTING; - case FormatUtils::VERSION_4: - return FormatUtils::VERSION_4; - case FormatUtils::VERSION_4_DEV: - return FormatUtils::VERSION_4_DEV; + case FormatUtils::VERSION_402: + return FormatUtils::VERSION_402; + case FormatUtils::VERSION_403: + return FormatUtils::VERSION_403; default: return FormatUtils::UNKNOWN_VERSION; } @@ -186,12 +166,12 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mLastDecayedTime; } - AK_FORCE_INLINE int getUnigramCount() const { - return mUnigramCount; + AK_FORCE_INLINE const EntryCounts &getNgramCounts() const { + return mNgramCounts; } - AK_FORCE_INLINE int getBigramCount() const { - return mBigramCount; + AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const { + return mMaxNgramCounts; } AK_FORCE_INLINE int getExtendedRegionSize() const { @@ -211,35 +191,19 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return &mAttributeMap; } - AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const { - return mForgettingCurveOccurrencesToLevelUp; - } - AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { return mForgettingCurveProbabilityValuesTableId; } - AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const { - return mForgettingCurveDurationToLevelDown; - } - - AK_FORCE_INLINE int getMaxUnigramCount() const { - return mMaxUnigramCount; - } - - AK_FORCE_INLINE int getMaxBigramCount() const { - return mMaxBigramCount; - } - void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; bool fillInAndWriteHeaderToBuffer(const bool 
updatesLastDecayedTime, - const int unigramCount, const int bigramCount, - const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const; + const EntryCounts &entryCounts, const int extendedRegionSize, + BufferWithExtendableBuffer *const outBuffer) const; - void fillInHeader(const bool updatesLastDecayedTime, - const int unigramCount, const int bigramCount, const int extendedRegionSize, + void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts, + const int extendedRegionSize, DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; AK_FORCE_INLINE const std::vector<int> *getLocale() const { @@ -247,7 +211,11 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { } bool supportsBeginningOfSentence() const { - return mDictFormatVersion >= FormatUtils::VERSION_4; + return mDictFormatVersion >= FormatUtils::VERSION_402; + } + + const int *getCodePointTable() const { + return mCodePointTable; } private: @@ -258,23 +226,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const IS_DECAYING_DICT_KEY; static const char *const DATE_KEY; static const char *const LAST_DECAYED_TIME_KEY; - static const char *const UNIGRAM_COUNT_KEY; - static const char *const BIGRAM_COUNT_KEY; + static const char *const NGRAM_COUNT_KEYS[]; + static const char *const MAX_NGRAM_COUNT_KEYS[]; + static const int DEFAULT_MAX_NGRAM_COUNTS[]; static const char *const EXTENDED_REGION_SIZE_KEY; static const char *const HAS_HISTORICAL_INFO_KEY; static const char *const LOCALE_KEY; static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; - static const char *const MAX_UNIGRAM_COUNT_KEY; - static const char *const MAX_BIGRAM_COUNT_KEY; static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float 
MULTIPLE_WORD_COST_MULTIPLIER_SCALE; - static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP; static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; - static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS; - static const int DEFAULT_MAX_UNIGRAM_COUNT; - static const int DEFAULT_MAX_BIGRAM_COUNT; const FormatUtils::FORMAT_VERSION mDictFormatVersion; const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; @@ -286,20 +249,18 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const bool mIsDecayingDict; const int mDate; const int mLastDecayedTime; - const int mUnigramCount; - const int mBigramCount; + const EntryCounts mNgramCounts; + const EntryCounts mMaxNgramCounts; const int mExtendedRegionSize; const bool mHasHistoricalInfoOfWords; - const int mForgettingCurveOccurrencesToLevelUp; const int mForgettingCurveProbabilityValuesTableId; - const int mForgettingCurveDurationToLevelDown; - const int mMaxUnigramCount; - const int mMaxBigramCount; + const int *const mCodePointTable; const std::vector<int> readLocale() const; float readMultipleWordCostMultiplier() const; bool readRequiresGermanUmlautProcessing() const; - + const EntryCounts readNgramCounts() const; + const EntryCounts readMaxNgramCounts() const; static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/dictionary/header/header_read_write_utils.cpp index a8f8f284b..779f8b8c3 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp +++ b/native/jni/src/dictionary/header/header_read_write_utils.cpp @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "dictionary/header/header_read_write_utils.h" #include <cctype> #include <cstdio> +#include <memory> #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { @@ -34,12 +35,13 @@ namespace latinime { const int HeaderReadWriteUtils::LARGEST_INT_DIGIT_COUNT = 11; const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; -const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 2048; const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; +const char *const HeaderReadWriteUtils::CODE_POINT_TABLE_KEY = "codePointTable"; const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; @@ -73,20 +75,32 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; return; } int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; - int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH]; + std::unique_ptr<int[]> valueBuffer(new int[MAX_ATTRIBUTE_VALUE_LENGTH]); while (pos < headerSize) { + // The values in the header don't use the code point table for their encoding. 
const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos); + MAX_ATTRIBUTE_KEY_LENGTH, nullptr /* codePointTable */, keyBuffer, &pos); std::vector<int> key; key.insert(key.end(), keyBuffer, keyBuffer + keyLength); const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos); + MAX_ATTRIBUTE_VALUE_LENGTH, nullptr /* codePointTable */, valueBuffer.get(), &pos); std::vector<int> value; - value.insert(value.end(), valueBuffer, valueBuffer + valueLength); + value.insert(value.end(), valueBuffer.get(), valueBuffer.get() + valueLength); headerAttributes->insert(AttributeMap::value_type(key, value)); } } +/* static */ const int *HeaderReadWriteUtils::readCodePointTable( + AttributeMap *const headerAttributes) { + AttributeMap::key_type keyVector; + insertCharactersIntoVector(CODE_POINT_TABLE_KEY, &keyVector); + AttributeMap::const_iterator it = headerAttributes->find(keyVector); + if (it == headerAttributes->end()) { + return nullptr; + } + return it->second.data(); +} + /* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, int *const writingPos) { @@ -96,11 +110,13 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; } switch (version) { case FormatUtils::VERSION_2: - // Version 2 dictionary writing is not supported. 
+ case FormatUtils::VERSION_201: + case FormatUtils::VERSION_202: + // None of the static dictionaries (v2x) support writing return false; case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4: - case FormatUtils::VERSION_4_DEV: + case FormatUtils::VERSION_402: + case FormatUtils::VERSION_403: return buffer->writeUintAndAdvancePosition(version /* data */, HEADER_DICTIONARY_VERSION_SIZE, writingPos); default: @@ -142,7 +158,8 @@ typedef DictionaryHeaderStructurePolicy::AttributeMap AttributeMap; } /* static */ void HeaderReadWriteUtils::setCodePointVectorAttribute( - AttributeMap *const headerAttributes, const char *const key, const std::vector<int> value) { + AttributeMap *const headerAttributes, const char *const key, + const std::vector<int> &value) { AttributeMap::key_type keyVector; insertCharactersIntoVector(key, &keyVector); (*headerAttributes)[keyVector] = value; diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/dictionary/header/header_read_write_utils.h index 9b90488fc..f67d614df 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h +++ b/native/jni/src/dictionary/header/header_read_write_utils.h @@ -20,8 +20,8 @@ #include <cstdint> #include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/utils/format_utils.h" namespace latinime { @@ -46,6 +46,9 @@ class HeaderReadWriteUtils { static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + static const int *readCodePointTable( + DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes); + static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, int *const 
writingPos); @@ -64,7 +67,7 @@ class HeaderReadWriteUtils { */ static void setCodePointVectorAttribute( DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, - const char *const key, const std::vector<int> value); + const char *const key, const std::vector<int> &value); static void setBoolAttribute( DictionaryHeaderStructurePolicy::AttributeMap *const headerAttributes, @@ -101,6 +104,8 @@ class HeaderReadWriteUtils { static const int HEADER_FLAG_SIZE; static const int HEADER_SIZE_FIELD_SIZE; + static const char *const CODE_POINT_TABLE_KEY; + // Value for the "flags" field. It's unused at the moment. static const DictionaryFlags NO_FLAGS; diff --git a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h index aa0d068aa..aa0d068aa 100644 --- a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h +++ b/native/jni/src/dictionary/interface/dictionary_bigrams_structure_policy.h diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h index 6da390e55..6da390e55 100644 --- a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h +++ b/native/jni/src/dictionary/interface/dictionary_header_structure_policy.h diff --git a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h index 40b6c2de1..40b6c2de1 100644 --- a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h +++ b/native/jni/src/dictionary/interface/dictionary_shortcuts_structure_policy.h diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h index e91f07682..ace48491d 100644 --- 
a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/dictionary/interface/dictionary_structure_with_buffer_policy.h @@ -20,16 +20,20 @@ #include <memory> #include "defines.h" -#include "suggest/core/dictionary/property/word_property.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/word_attributes.h" +#include "dictionary/property/word_property.h" +#include "dictionary/utils/binary_dictionary_shortcut_iterator.h" +#include "utils/int_array_view.h" namespace latinime { class DicNode; class DicNodeVector; class DictionaryHeaderStructurePolicy; -class DictionaryShortcutsStructurePolicy; +class MultiBigramMap; class NgramListener; -class PrevWordsInfo; +class NgramContext; class UnigramProperty; /* @@ -47,42 +51,45 @@ class DictionaryStructureWithBufferPolicy { virtual void createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const = 0; - virtual int getCodePointsAndProbabilityAndReturnCodePointCount( - const int nodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const = 0; + virtual int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const = 0; - virtual int getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const = 0; + virtual int getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const = 0; - virtual int getProbability(const int unigramProbability, - const int bigramProbability) const = 0; + virtual const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const = 0; - virtual int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, - const int nodePos) const = 0; + // TODO: Remove + virtual int getProbability(const int unigramProbability, const int 
bigramProbability) const = 0; - virtual void iterateNgramEntries(const int *const prevWordsPtNodePos, + virtual int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const = 0; + + virtual void iterateNgramEntries(const WordIdArrayView prevWordIds, NgramListener *const listener) const = 0; - virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0; + virtual BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const = 0; virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0; - virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0; - // Returns whether the update was success or not. - virtual bool addUnigramEntry(const int *const word, const int length, + virtual bool addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty) = 0; // Returns whether the update was success or not. - virtual bool removeUnigramEntry(const int *const word, const int length) = 0; + virtual bool removeUnigramEntry(const CodePointArrayView wordCodePoints) = 0; + + // Returns whether the update was success or not. + virtual bool addNgramEntry(const NgramProperty *const ngramProperty) = 0; // Returns whether the update was success or not. - virtual bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty) = 0; + virtual bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) = 0; // Returns whether the update was success or not. - virtual bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const int *const word, const int length) = 0; + virtual bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo) = 0; // Returns whether the flush was success or not. 
virtual bool flush(const char *const filePath) = 0; @@ -97,9 +104,7 @@ class DictionaryStructureWithBufferPolicy { virtual void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength) = 0; - // Used for testing. - virtual const WordProperty getWordProperty(const int *const codePonts, - const int codePointCount) const = 0; + virtual const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const = 0; // Method to iterate all words in the dictionary. // The returned token has to be used to get the next word. If token is 0, this method newly diff --git a/native/jni/src/suggest/core/dictionary/ngram_listener.h b/native/jni/src/dictionary/interface/ngram_listener.h index 88b88bafb..2eb5e9fd1 100644 --- a/native/jni/src/suggest/core/dictionary/ngram_listener.h +++ b/native/jni/src/dictionary/interface/ngram_listener.h @@ -26,7 +26,9 @@ namespace latinime { */ class NgramListener { public: - virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos) = 0; + // ngramProbability is always 0 for v403 decaying dictionary. + // TODO: Remove ngramProbability. + virtual void onVisitEntry(const int ngramProbability, const int targetWordId) = 0; virtual ~NgramListener() {}; protected: diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h b/native/jni/src/dictionary/property/historical_info.h index 428ca8626..e5ce1ea25 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/historical_info.h +++ b/native/jni/src/dictionary/property/historical_info.h @@ -34,10 +34,11 @@ class HistoricalInfo { return mTimestamp != NOT_A_TIMESTAMP; } - int getTimeStamp() const { + int getTimestamp() const { return mTimestamp; } + // TODO: Remove int getLevel() const { return mLevel; } @@ -47,7 +48,7 @@ class HistoricalInfo { } private: - // Copy constructor is public to use this class as a type of return value. + // Default copy constructor is used for using in std::vector. 
DISALLOW_ASSIGNMENT_OPERATOR(HistoricalInfo); const int mTimestamp;
diff --git a/native/jni/src/dictionary/property/ngram_context.cpp b/native/jni/src/dictionary/property/ngram_context.cpp
new file mode 100644
index 000000000..7b9c3eff6
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dictionary/property/ngram_context.h"
+
+#include <algorithm>
+#include <cstring>
+
+#include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
+#include "utils/char_utils.h"
+
+namespace latinime {
+
+// Creates a context without any previous word. Every slot is cleared because isValid() reads
+// slot 0 regardless of mPrevWordCount.
+NgramContext::NgramContext() : mPrevWordCount(0) {
+    clear();
+}
+
+// Copies the first mPrevWordCount slots of ngramContext. All slots are cleared first so that
+// the slots that are not copied hold defined values.
+NgramContext::NgramContext(const NgramContext &ngramContext)
+        : mPrevWordCount(ngramContext.mPrevWordCount) {
+    clear();
+    for (size_t i = 0; i < mPrevWordCount; ++i) {
+        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
+        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
+                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
+        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
+    }
+}
+
+// Builds a context from at most NELEMS(mPrevWordCodePoints) previous words. A word whose code
+// point count is negative or larger than MAX_WORD_LENGTH is skipped; its slot stays cleared.
+NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+        const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+        const size_t prevWordCount)
+        : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
+    clear();
+    if (!prevWordCodePoints || !prevWordCodePointCount || !isBeginningOfSentence) {
+        // Nothing to read from. Keep every slot cleared, like the single word constructor.
+        return;
+    }
+    for (size_t i = 0; i < mPrevWordCount; ++i) {
+        if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
+            continue;
+        }
+        memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
+                sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
+        mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
+        mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
+    }
+}
+
+// Builds a context from a single previous word. The slot stays cleared when the word is null
+// or its code point count is outside [0, MAX_WORD_LENGTH].
+NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+        const bool isBeginningOfSentence) : mPrevWordCount(1) {
+    clear();
+    // A negative count would be converted to a huge size by the memmove() below.
+    if (!prevWordCodePoints || prevWordCodePointCount < 0
+            || prevWordCodePointCount > MAX_WORD_LENGTH) {
+        return;
+    }
+    memmove(mPrevWordCodePoints[0], prevWordCodePoints,
+            sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
+    mPrevWordCodePointCount[0] = prevWordCodePointCount;
+    mIsBeginningOfSentence[0] = isBeginningOfSentence;
+}
+
+// The context is valid when slot 0 holds at least one code point or is flagged as the
+// beginning of a sentence.
+bool NgramContext::isValid() const {
+    return mPrevWordCodePointCount[0] > 0 || mIsBeginningOfSentence[0];
+}
+
+// n is 1-indexed. Returns a default-constructed view when n is out of range.
+const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
+    // n is unsigned, so 0 is the only value below the valid range.
+    if (n == 0 || n > mPrevWordCount) {
+        return CodePointArrayView();
+    }
+    return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
+}
+
+// n is 1-indexed. Returns false when n is out of range.
+bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
+    if (n == 0 || n > mPrevWordCount) {
+        return false;
+    }
+    return mIsBeginningOfSentence[n - 1];
+}
+
+// Looks up the id of the given word through dictStructurePolicy. When isBeginningOfSentence is
+// set, the lookup uses a copy of the word that has been passed through
+// CharUtils::attachBeginningOfSentenceMarker(). Returns NOT_A_WORD_ID for a null policy, a null
+// word, or a code point count outside [0, MAX_WORD_LENGTH].
+/* static */ int NgramContext::getWordId(
+        const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+        const int *const wordCodePoints, const int wordCodePointCount,
+        const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
+    // A negative count would be converted to a huge size by the memmove() below.
+    if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount < 0
+            || wordCodePointCount > MAX_WORD_LENGTH) {
+        return NOT_A_WORD_ID;
+    }
+    int codePoints[MAX_WORD_LENGTH];
+    int codePointCount = wordCodePointCount;
+    memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
+    if (isBeginningOfSentence) {
+        codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
+                MAX_WORD_LENGTH);
+        if (codePointCount <= 0) {
+            return NOT_A_WORD_ID;
+        }
+    }
+    const CodePointArrayView codePointArrayView(codePoints, codePointCount);
+    const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
+            false /* forceLowerCaseSearch */);
+    if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
+        // Return the id when the word was found or when lower case search is not requested.
+        return wordId;
+    }
+    // Check bigrams for lower-cased previous word if original was not found. Useful for
+    // auto-capitalized words like "The [current_word]".
+    return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
+}
+
+// Resets every slot to "no code points, not the beginning of a sentence". The code point
+// buffers themselves are not touched.
+void NgramContext::clear() {
+    for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
+        mPrevWordCodePointCount[i] = 0;
+        mIsBeginningOfSentence[i] = false;
+    }
+}
+} // namespace latinime
diff --git a/native/jni/src/dictionary/property/ngram_context.h b/native/jni/src/dictionary/property/ngram_context.h
new file mode 100644
index 000000000..9b36199c9
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_context.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_CONTEXT_H
+#define LATINIME_NGRAM_CONTEXT_H
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+#include "defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+
+class DictionaryStructureWithBufferPolicy;
+
+// The words that precede a word (the n-gram context). Stores up to
+// MAX_PREV_WORD_COUNT_FOR_N_GRAM previous words; each one has its code points, its code point
+// count and a flag telling whether it stands for the beginning of a sentence.
+class NgramContext {
+ public:
+    // No prev word information.
+    NgramContext();
+    // Copy constructor to use this class with std::vector and use this class as a return value.
+    NgramContext(const NgramContext &ngramContext);
+    // Construct from previous words. The three arrays are indexed by previous word.
+    NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
+            const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
+            const size_t prevWordCount);
+    // Construct from a previous word.
+    NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
+            const bool isBeginningOfSentence);
+
+    size_t getPrevWordCount() const {
+        return mPrevWordCount;
+    }
+    bool isValid() const;
+
+    // Looks up the word id of each previous word through dictStructurePolicy and stores the ids
+    // in prevWordIdBuffer. At most N ids are written. The returned view is created from the
+    // buffer and then limited to mPrevWordCount.
+    template<size_t N>
+    const WordIdArrayView getPrevWordIds(
+            const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+            WordIdArray<N> *const prevWordIdBuffer, const bool tryLowerCaseSearch) const {
+        // The bound does not change inside the loop, so it is computed only once.
+        const size_t wordCountToLookUp = std::min(mPrevWordCount, N);
+        for (size_t i = 0; i < wordCountToLookUp; ++i) {
+            prevWordIdBuffer->at(i) = getWordId(dictStructurePolicy, mPrevWordCodePoints[i],
+                    mPrevWordCodePointCount[i], mIsBeginningOfSentence[i], tryLowerCaseSearch);
+        }
+        return WordIdArrayView::fromArray(*prevWordIdBuffer).limit(mPrevWordCount);
+    }
+
+    // n is 1-indexed.
+    const CodePointArrayView getNthPrevWordCodePoints(const size_t n) const;
+    // n is 1-indexed.
+    bool isNthPrevWordBeginningOfSentence(const size_t n) const;
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(NgramContext);
+
+    // Returns the id of a single word, or NOT_A_WORD_ID when it cannot be looked up.
+    static int getWordId(const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
+            const int *const wordCodePoints, const int wordCodePointCount,
+            const bool isBeginningOfSentence, const bool tryLowerCaseSearch);
+    // Resets the code point count and the beginning-of-sentence flag of every slot.
+    void clear();
+
+    // Number of slots that are in use.
+    const size_t mPrevWordCount;
+    int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH];
+    int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+    bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_CONTEXT_H
diff --git a/native/jni/src/dictionary/property/ngram_property.h b/native/jni/src/dictionary/property/ngram_property.h
new file mode 100644
index 000000000..5f259ec59
--- /dev/null
+++ b/native/jni/src/dictionary/property/ngram_property.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_NGRAM_PROPERTY_H
+#define LATINIME_NGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+#include "dictionary/property/ngram_context.h"
+
+namespace latinime {
+
+// Immutable description of one n-gram entry: the context (previous words), the code points of
+// the target word, a probability and the historical info.
+class NgramProperty {
+ public:
+    // targetCodePoints is a const rvalue reference, so it cannot be moved from. It is copied,
+    // which is also what std::move() on it ended up doing.
+    NgramProperty(const NgramContext &ngramContext, const std::vector<int> &&targetCodePoints,
+            const int probability, const HistoricalInfo historicalInfo)
+            : mNgramContext(ngramContext), mTargetCodePoints(targetCodePoints),
+              mProbability(probability), mHistoricalInfo(historicalInfo) {}
+
+    // The returned pointer refers to a member of this instance.
+    const NgramContext *getNgramContext() const {
+        return &mNgramContext;
+    }
+
+    // The returned pointer refers to a member of this instance.
+    const std::vector<int> *getTargetCodePoints() const {
+        return &mTargetCodePoints;
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    // Returns a copy of the historical info.
+    const HistoricalInfo getHistoricalInfo() const {
+        return mHistoricalInfo;
+    }
+
+ private:
+    // The default copy constructor is used so that instances can be stored in std::vector.
+    DISALLOW_DEFAULT_CONSTRUCTOR(NgramProperty);
+    DISALLOW_ASSIGNMENT_OPERATOR(NgramProperty);
+
+    const NgramContext mNgramContext;
+    const std::vector<int> mTargetCodePoints;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+};
+} // namespace latinime
+#endif // LATINIME_NGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/unigram_property.h b/native/jni/src/dictionary/property/unigram_property.h
new file mode 100644
index 000000000..92f61b85d
--- /dev/null
+++ b/native/jni/src/dictionary/property/unigram_property.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_UNIGRAM_PROPERTY_H
+#define LATINIME_UNIGRAM_PROPERTY_H
+
+#include <vector>
+
+#include "defines.h"
+#include "dictionary/property/historical_info.h"
+
+namespace latinime {
+
+// Immutable description of one unigram entry: its flags, probability, historical info and
+// shortcuts.
+class UnigramProperty {
+ public:
+    // One shortcut of a unigram: the code points of the shortcut target and a probability.
+    class ShortcutProperty {
+     public:
+        // targetCodePoints is a const rvalue reference, so it cannot be moved from. It is
+        // copied, which is also what std::move() on it ended up doing.
+        ShortcutProperty(const std::vector<int> &&targetCodePoints, const int probability)
+                : mTargetCodePoints(targetCodePoints),
+                  mProbability(probability) {}
+
+        // The returned pointer refers to a member of this instance.
+        const std::vector<int> *getTargetCodePoints() const {
+            return &mTargetCodePoints;
+        }
+
+        int getProbability() const {
+            return mProbability;
+        }
+
+     private:
+        // The default copy constructor is used so that instances can be stored in std::vector.
+        DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty);
+
+        const std::vector<int> mTargetCodePoints;
+        const int mProbability;
+    };
+
+    // All flags are false, the probability is NOT_A_PROBABILITY and there are no shortcuts.
+    UnigramProperty()
+            : mRepresentsBeginningOfSentence(false), mIsNotAWord(false),
+              mIsBlacklisted(false), mIsPossiblyOffensive(false), mProbability(NOT_A_PROBABILITY),
+              mHistoricalInfo(), mShortcuts() {}
+
+    // In contexts which do not support the Blacklisted flag (v2, v4<403)
+    // shortcuts is a const rvalue reference and is therefore copied, not moved.
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts(shortcuts) {}
+
+    // Without shortcuts, in contexts which do not support the Blacklisted flag (v2, v4<403)
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(false),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+    // In contexts which DO support the Blacklisted flag (v403)
+    // shortcuts is a const rvalue reference and is therefore copied, not moved.
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo, const std::vector<ShortcutProperty> &&shortcuts)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts(shortcuts) {}
+
+    // Without shortcuts, in contexts which DO support the Blacklisted flag (v403)
+    UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord,
+            const bool isBlacklisted, const bool isPossiblyOffensive, const int probability,
+            const HistoricalInfo historicalInfo)
+            : mRepresentsBeginningOfSentence(representsBeginningOfSentence),
+              mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted),
+              mIsPossiblyOffensive(isPossiblyOffensive), mProbability(probability),
+              mHistoricalInfo(historicalInfo), mShortcuts() {}
+
+    bool representsBeginningOfSentence() const {
+        return mRepresentsBeginningOfSentence;
+    }
+
+    bool isNotAWord() const {
+        return mIsNotAWord;
+    }
+
+    bool isPossiblyOffensive() const {
+        return mIsPossiblyOffensive;
+    }
+
+    bool isBlacklisted() const {
+        return mIsBlacklisted;
+    }
+
+    bool hasShortcuts() const {
+        return !mShortcuts.empty();
+    }
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    // Returns a copy of the historical info.
+    const HistoricalInfo getHistoricalInfo() const {
+        return mHistoricalInfo;
+    }
+
+    // The returned reference refers to a member of this instance.
+    const std::vector<ShortcutProperty> &getShortcuts() const {
+        return mShortcuts;
+    }
+
+ private:
+    // The default copy constructor is used so that instances can be used as a return value.
+    DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty);
+
+    const bool mRepresentsBeginningOfSentence;
+    const bool mIsNotAWord;
+    const bool mIsBlacklisted;
+    const bool mIsPossiblyOffensive;
+    const int mProbability;
+    const HistoricalInfo mHistoricalInfo;
+    const std::vector<ShortcutProperty> mShortcuts;
+};
+} // namespace latinime
+#endif // LATINIME_UNIGRAM_PROPERTY_H
diff --git a/native/jni/src/dictionary/property/word_attributes.h b/native/jni/src/dictionary/property/word_attributes.h
new file mode 100644
index 000000000..5351e7d7d
--- /dev/null
+++ b/native/jni/src/dictionary/property/word_attributes.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014, The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_WORD_ATTRIBUTES_H
+#define LATINIME_WORD_ATTRIBUTES_H
+
+#include "defines.h"
+
+// Probability and flags of a word.
+// NOTE: this class is declared at global scope; this header does not open a namespace.
+class WordAttributes {
+ public:
+    // Invalid word attributes.
+    WordAttributes()
+            : mProbability(NOT_A_PROBABILITY), mIsBlacklisted(false), mIsNotAWord(false),
+              mIsPossiblyOffensive(false) {}
+
+    WordAttributes(const int probability, const bool isBlacklisted, const bool isNotAWord,
+            const bool isPossiblyOffensive)
+            : mProbability(probability), mIsBlacklisted(isBlacklisted), mIsNotAWord(isNotAWord),
+              mIsPossiblyOffensive(isPossiblyOffensive) {}
+
+    int getProbability() const {
+        return mProbability;
+    }
+
+    bool isBlacklisted() const {
+        return mIsBlacklisted;
+    }
+
+    bool isNotAWord() const {
+        return mIsNotAWord;
+    }
+
+    // Whether or not a word is possibly offensive.
+    // * Static dictionaries <v202, as well as dynamic dictionaries <v403, will set this based on
+    //   whether or not the probability of the word is zero.
+    // * Static dictionaries >=v203 will set this based on the IS_POSSIBLY_OFFENSIVE PtNode flag.
+    // * Dynamic dictionaries >=v403 will set this based on the IS_POSSIBLY_OFFENSIVE language model
+    //   flag (the PtNode flag IS_BLACKLISTED is ignored and kept as zero)
+    //
+    // See the ::getWordAttributes function for each of these dictionary policies for more details.
+    bool isPossiblyOffensive() const {
+        return mIsPossiblyOffensive;
+    }
+
+ private:
+    DISALLOW_ASSIGNMENT_OPERATOR(WordAttributes);
+
+    int mProbability;
+    bool mIsBlacklisted;
+    bool mIsNotAWord;
+    bool mIsPossiblyOffensive;
+};
+
+#endif /* LATINIME_WORD_ATTRIBUTES_H */
diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/dictionary/property/word_property.h index aa3e0b68a..3028e020a 100644 --- a/native/jni/src/suggest/core/dictionary/property/word_property.h +++ b/native/jni/src/dictionary/property/word_property.h @@ -20,9 +20,9 @@ #include <vector> #include "defines.h" -#include "jni.h" -#include "suggest/core/dictionary/property/bigram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "utils/int_array_view.h" namespace latinime { @@ -31,23 +31,23 @@ class WordProperty { public: // Default constructor is used to create an instance that indicates an invalid word.
WordProperty() - : mCodePoints(), mUnigramProperty(), mBigrams() {} + : mCodePoints(), mUnigramProperty(), mNgrams() {} - WordProperty(const std::vector<int> *const codePoints, - const UnigramProperty *const unigramProperty, - const std::vector<BigramProperty> *const bigrams) - : mCodePoints(*codePoints), mUnigramProperty(*unigramProperty), mBigrams(*bigrams) {} + WordProperty(const std::vector<int> &&codePoints, const UnigramProperty &unigramProperty, + const std::vector<NgramProperty> &ngrams) + : mCodePoints(std::move(codePoints)), mUnigramProperty(unigramProperty), + mNgrams(ngrams) {} - void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, - jobject outShortcutTargets, jobject outShortcutProbabilities) const; + const CodePointArrayView getCodePoints() const { + return CodePointArrayView(mCodePoints); + } - const UnigramProperty *getUnigramProperty() const { - return &mUnigramProperty; + const UnigramProperty &getUnigramProperty() const { + return mUnigramProperty; } - const std::vector<BigramProperty> *getBigramProperties() const { - return &mBigrams; + const std::vector<NgramProperty> &getNgramProperties() const { + return mNgrams; } private: @@ -56,7 +56,7 @@ class WordProperty { const std::vector<int> mCodePoints; const UnigramProperty mUnigramProperty; - const std::vector<BigramProperty> mBigrams; + const std::vector<NgramProperty> mNgrams; }; } // namespace latinime #endif // LATINIME_WORD_PROPERTY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt b/native/jni/src/dictionary/structure/backward/v402/Readme.txt index 9e29e836c..9e29e836c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/Readme.txt +++ b/native/jni/src/dictionary/structure/backward/v402/Readme.txt diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp index 3e8e059f2..60749bce6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.cpp @@ -19,18 +19,18 @@ * Do not edit this file other than updating policy's interface. * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp + * dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/core/dictionary/property/bigram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/forgetting_curve_utils.h" namespace latinime { namespace backward { @@ -60,7 +60,7 @@ void 
Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const out } bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) { + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { // 1. The word has no bigrams yet. // 2. The word has bigrams, and there is the target in the list. // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. @@ -79,7 +79,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, - bigramProperty); + ngramProperty); // Write an entry. const int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); if (!mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, writingPos)) { @@ -112,7 +112,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &newBigramEntry, bigramProperty); + &newBigramEntry, ngramProperty); if (!mBigramDictContent->writeBigramEntryAtTail(&bigramEntryToWrite)) { return false; } @@ -138,7 +138,7 @@ bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTarget const BigramEntry updatedBigramEntry = originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &updatedBigramEntry, bigramProperty); + &updatedBigramEntry, ngramProperty); return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); } @@ -264,18 +264,17 @@ int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, const BigramEntry 
Ver4BigramListPolicy::createUpdatedBigramEntryFrom( const BigramEntry *const originalBigramEntry, - const BigramProperty *const bigramProperty) const { + const NgramProperty *const ngramProperty) const { // TODO: Consolidate historical info and probability. if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(), - bigramProperty->getLevel(), bigramProperty->getCount()); + const HistoricalInfo &historicalInfoForUpdate = ngramProperty->getHistoricalInfo(); const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(), + originalBigramEntry->getHistoricalInfo(), ngramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); } else { - return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability()); + return originalBigramEntry->updateProbabilityAndGetEntry(ngramProperty->getProbability()); } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h index 50a4c9743..58c88ce8a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h +++ b/native/jni/src/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h @@ -26,8 +26,8 @@ #define LATINIME_BACKWARD_V402_VER4_BIGRAM_LIST_POLICY_H #include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/structure/backward/v402/content/bigram_entry.h" namespace latinime { namespace backward { @@ -36,7 +36,7 @@ 
namespace v402 { class BigramDictContent; } // namespace v402 } // namespace backward -class BigramProperty; +class NgramProperty; namespace backward { namespace v402 { } // namespace v402 @@ -64,7 +64,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { } bool addNewEntry(const int terminalId, const int newTargetTerminalId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); bool removeEntry(const int terminalId, const int targetTerminalId); @@ -80,7 +80,7 @@ class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { int *const outTailEntryPos) const; const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, - const BigramProperty *const bigramProperty) const; + const NgramProperty *const ngramProperty) const; bool updateHasNextFlag(const bool hasNext, const int bigramEntryPos); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp index e2dd93c5e..7fa85dec2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.cpp @@ -18,12 +18,12 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp + * dictionary/structure/v4/content/bigram_dict_content.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "dictionary/structure/backward/v402/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { namespace backward { @@ -65,6 +65,8 @@ const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( (encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; if (mHasHistoricalInfo) { + // Hack for better migration. + count += level; const HistoricalInfo historicalInfo(timestamp, level, count); return BigramEntry(hasNext, probability, &historicalInfo, targetTerminalId); } else { @@ -83,10 +85,10 @@ bool BigramDictContent::writeBigramEntryAndAdvancePosition( } if (mHasHistoricalInfo) { const HistoricalInfo *const historicalInfo = bigramEntryToWrite->getHistoricalInfo(); - if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), + if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { AKLOGE("Cannot write bigram timestamps. 
pos: %d, timestamp: %d", *entryWritingPos, - historicalInfo->getTimeStamp()); + historicalInfo->getTimestamp()); return false; } if (!bigramListBuffer->writeUintAndAdvancePosition(historicalInfo->getLevel(), diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h index b554e5676..14f334a12 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_dict_content.h @@ -18,17 +18,17 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h + * dictionary/structure/v4/content/bigram_dict_content.h */ #ifndef LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H #define LATINIME_BACKWARD_V402_BIGRAM_DICT_CONTENT_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/content/bigram_entry.h" +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h index 40968b4d8..36ad855ee 100644 --- 
a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_entry.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/bigram_entry.h @@ -18,15 +18,15 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h + * dictionary/structure/v4/content/bigram_entry.h */ #ifndef LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H #define LATINIME_BACKWARD_V402_BIGRAM_ENTRY_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/historical_info.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h index 0f2f25534..d3b84fa04 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/dict_content.h @@ -18,7 +18,7 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/dict_content.h + * dictionary/structure/v4/content/dict_content.h */ #ifndef LATINIME_BACKWARD_V402_DICT_CONTENT_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp index c671647d4..b167f0ab2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.cpp @@ -18,15 +18,15 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.cpp + * dictionary/structure/v4/content/probability_dict_content.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "dictionary/structure/backward/v402/content/probability_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { namespace backward { @@ -50,7 +50,8 @@ const ProbabilityEntry ProbabilityDictContent::getProbabilityEntry(const int ter Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, &entryPos); const int count = buffer->readUintAndAdvancePosition( Ver4DictConstants::WORD_COUNT_FIELD_SIZE, &entryPos); - const HistoricalInfo historicalInfo(timestamp, level, count); + // Hack for better migration. 
+ const HistoricalInfo historicalInfo(timestamp, level, count + level); return ProbabilityEntry(flags, probability, &historicalInfo); } else { return ProbabilityEntry(flags, probability); @@ -74,8 +75,8 @@ bool ProbabilityDictContent::setProbabilityEntry(const int terminalId, return false; } writingPos += getEntrySize(); - mSize++; } + mSize = terminalId + 1; } return writeEntry(probabilityEntry, entryPos); } @@ -100,7 +101,6 @@ bool ProbabilityDictContent::flushToFile(const char *const dictPath) const { bool ProbabilityDictContent::runGC( const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, const ProbabilityDictContent *const originalProbabilityDictContent) { - mSize = 0; for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); it != terminalIdMap->end(); ++it) { const ProbabilityEntry probabilityEntry = @@ -109,7 +109,6 @@ bool ProbabilityDictContent::runGC( AKLOGE("Cannot set probability entry in runGC. terminalId: %d", it->second); return false; } - mSize++; } return true; } @@ -147,7 +146,7 @@ bool ProbabilityDictContent::writeEntry(const ProbabilityEntry *const probabilit } if (mHasHistoricalInfo) { const HistoricalInfo *const historicalInfo = probabilityEntry->getHistoricalInfo(); - if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimeStamp(), + if (!bufferToWrite->writeUintAndAdvancePosition(historicalInfo->getTimestamp(), Ver4DictConstants::TIME_STAMP_FIELD_SIZE, &writingPos)) { AKLOGE("Cannot write timestamp in probability dict content. 
pos: %d", writingPos); return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h index 3734797d4..464b29f3f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_dict_content.h @@ -18,17 +18,17 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/probability_dict_content.h + * dictionary/structure/v4/content/probability_dict_content.h */ #ifndef LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H #define LATINIME_BACKWARD_V402_PROBABILITY_DICT_CONTENT_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/structure/backward/v402/content/single_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h index 8ccfa33dc..94e36bf51 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/probability_entry.h @@ -18,15 +18,15 @@ * !!!!! 
DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h + * dictionary/structure/v4/content/probability_entry.h */ #ifndef LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H #define LATINIME_BACKWARD_V402_PROBABILITY_ENTRY_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/historical_info.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp index 56bc8b98d..e538a02a1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.cpp @@ -18,12 +18,12 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp + * dictionary/structure/v4/content/shortcut_dict_content.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" +#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h index 179cec5bb..3b725e896 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/shortcut_dict_content.h @@ -18,16 +18,16 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h + * dictionary/structure/v4/content/shortcut_dict_content.h */ #ifndef LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H #define LATINIME_BACKWARD_V402_SHORTCUT_DICT_CONTENT_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h index 49f446814..89df2a1e0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/single_dict_content.h @@ -18,18 +18,18 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h + * dictionary/structure/v4/content/single_dict_content.h */ #ifndef LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H #define LATINIME_BACKWARD_V402_SINGLE_DICT_CONTENT_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/structure/backward/v402/content/dict_content.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/mmapped_buffer.h" #include "utils/byte_array_view.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp index 7c9b4967a..280f0f85a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.cpp @@ -18,10 +18,10 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp + * dictionary/structure/v4/content/sparse_table_dict_content.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h" +#include "dictionary/structure/backward/v402/content/sparse_table_dict_content.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h index 3c626df11..4b5af87ad 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/sparse_table_dict_content.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/sparse_table_dict_content.h @@ -18,19 +18,19 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h + * dictionary/structure/v4/content/sparse_table_dict_content.h */ #ifndef LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H #define LATINIME_BACKWARD_V402_SPARSE_TABLE_DICT_CONTENT_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" +#include "dictionary/structure/backward/v402/content/dict_content.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/mmapped_buffer.h" 
+#include "dictionary/utils/sparse_table.h" #include "utils/byte_array_view.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp index a9f841779..30b72bbd1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.cpp @@ -18,13 +18,13 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp + * dictionary/structure/v4/content/terminal_position_lookup_table.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h index eadfe0faa..641c7496f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h +++ b/native/jni/src/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h @@ -18,7 +18,7 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h + * dictionary/structure/v4/content/terminal_position_lookup_table.h */ #ifndef LATINIME_BACKWARD_V402_TERMINAL_POSITION_LOOKUP_TABLE_H @@ -27,8 +27,8 @@ #include <unordered_map> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/single_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/content/single_dict_content.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h index 941fda748..8cda8c5cf 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h +++ b/native/jni/src/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h @@ -19,17 +19,17 @@ * Do not edit this file other than updating policy's interface. 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h + * dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h */ #ifndef LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H #define LATINIME_BACKWARD_V402_VER4_SHORTCUT_LIST_POLICY_H #include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp index 3dfbd1c94..4a9704f4d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.cpp @@ -18,18 +18,18 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp + * dictionary/structure/v4/ver4_dict_buffers.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" #include <cerrno> #include <cstring> #include <sys/stat.h> #include <sys/types.h> -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/file_utils.h" #include "utils/byte_array_view.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h index e775be52e..0d09fee9a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_buffers.h @@ -18,7 +18,7 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h + * dictionary/structure/v4/ver4_dict_buffers.h */ #ifndef LATINIME_BACKWARD_V402_VER4_DICT_BUFFER_H @@ -27,14 +27,14 @@ #include <memory> #include "defines.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/content/bigram_dict_content.h" +#include "dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "dictionary/structure/backward/v402/content/shortcut_dict_content.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/mmapped_buffer.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp index 81d85f495..2948d0716 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.cpp @@ -18,10 +18,10 @@ * !!!!! 
DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp + * dictionary/structure/v4/ver4_dict_constants.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h index 88ebd6a75..15581d852 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_dict_constants.h @@ -18,7 +18,7 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h + * dictionary/structure/v4/ver4_dict_constants.h */ #ifndef LATINIME_BACKWARD_V402_VER4_DICT_CONSTANTS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp index 82399f190..871ef7aaf 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.cpp @@ -18,18 +18,19 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp + * dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/backward/v402/content/probability_dict_content.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" namespace latinime { namespace backward { @@ -59,8 +60,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce const int parentPos = DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); int codePoints[MAX_WORD_LENGTH]; - const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); + const int codePointCount = 
PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, mHeaderPolicy->getCodePointTable(), codePoints, &pos); int terminalIdFieldPos = NOT_A_DICT_POS; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; int probability = NOT_A_PROBABILITY; @@ -98,7 +99,7 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce // The destination position is stored at the same place as the parent position. return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); } else { - return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, + return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, newSiblingNodePos); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h index 1999a51a6..367d6f9f8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h @@ -18,15 +18,15 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h + * dictionary/structure/v4/ver4_patricia_trie_node_reader.h */ #ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H #define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_READER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp index 278f2b199..e3ab5ec20 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.cpp @@ -18,23 +18,23 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp + * dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include 
"dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" namespace latinime { namespace backward { @@ -232,10 +232,10 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( } bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) { - if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty, outAddedNewEntry)) { - AKLOGE("Cannot add new bigram entry. terminalId: %d, targetTerminalId: %d", - sourcePtNodeParams->getTerminalId(), targetPtNodeParam->getTerminalId()); + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { + if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, ngramProperty, outAddedNewEntry)) { + AKLOGE("Cannot add new bigram entry. prevWordId: %d, wordId: %d", + prevWordIds[0], wordId); return false; } const int ptNodePos = @@ -245,7 +245,7 @@ bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds if (!sourcePtNodeParams.hasBigrams()) { // Update has bigrams flag. return updatePtNodeFlags(sourcePtNodeParams.getHeadPos(), - sourcePtNodeParams.isBlacklisted(), sourcePtNodeParams.isNotAWord(), + sourcePtNodeParams.isPossiblyOffensive(), sourcePtNodeParams.isNotAWord(), sourcePtNodeParams.isTerminal(), sourcePtNodeParams.hasShortcutTargets(), true /* hasBigrams */, sourcePtNodeParams.getCodePointCount() > 1 /* hasMultipleChars */); @@ -310,13 +310,13 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN const int shortcutProbability) { if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), targetCodePoints, targetCodePointCount, shortcutProbability)) { - AKLOGE("Cannot add new shortuct entry. 
terminalId: %d", ptNodeParams->getTerminalId()); + AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId()); return false; } if (!ptNodeParams->hasShortcutTargets()) { // Update has shortcut targets flag. return updatePtNodeFlags(ptNodeParams->getHeadPos(), - ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), + ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), true /* hasShortcutTargets */, ptNodeParams->hasBigrams(), ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); @@ -330,7 +330,7 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeHasBigramsAndShortcutTargetsFlags( ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; const bool hasShortcutTargets = mBuffers->getShortcutDictContent()->getShortcutListHeadPos( ptNodeParams->getTerminalId()) != NOT_A_DICT_POS; - return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isBlacklisted(), + return updatePtNodeFlags(ptNodeParams->getHeadPos(), ptNodeParams->isPossiblyOffensive(), ptNodeParams->isNotAWord(), ptNodeParams->isTerminal(), hasShortcutTargets, hasBigrams, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); } @@ -386,8 +386,9 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { return false; } - return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), - isTerminal, ptNodeParams->hasShortcutTargets(), ptNodeParams->hasBigrams(), + return updatePtNodeFlags(nodePos, ptNodeParams->isPossiblyOffensive(), + ptNodeParams->isNotAWord(), isTerminal, ptNodeParams->hasShortcutTargets(), + ptNodeParams->hasBigrams(), ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); } @@ -396,8 +397,7 @@ const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( const UnigramProperty *const unigramProperty) const { // TODO: Consolidate historical info and probability. 
if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(), - unigramProperty->getLevel(), unigramProperty->getCount()); + const HistoricalInfo &historicalInfoForUpdate = unigramProperty->getHistoricalInfo(); const HistoricalInfo updatedHistoricalInfo = ForgettingCurveUtils::createUpdatedHistoricalInfo( originalProbabilityEntry->getHistoricalInfo(), @@ -425,6 +425,18 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, return true; } +bool Ver4PatriciaTrieNodeWriter::suppressUnigramEntry(const PtNodeParams *const ptNodeParams) { + if (!mHeaderPolicy->hasHistoricalInfoOfWords()) { + // Require historical info to suppress unigram entry. + return false; + } + const HistoricalInfo suppressedHistorycalInfo(0 /* timestamp */, 0 /* level */, 0 /* count */); + const ProbabilityEntry probabilityEntryToWrite = + ProbabilityEntry().createEntryWithUpdatedHistoricalInfo(&suppressedHistorycalInfo); + return mBuffers->getMutableProbabilityDictContent()->setProbabilityEntry( + ptNodeParams->getTerminalId(), &probabilityEntryToWrite); +} + } // namespace v402 } // namespace backward } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h index d49d9a666..db3cea174 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h @@ -18,17 +18,17 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h + * dictionary/structure/v4/ver4_patricia_trie_node_writer.h */ #ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H #define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_NODE_WRITER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/probability_entry.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/structure/backward/v402/content/probability_entry.h" #include "utils/int_array_view.h" namespace latinime { @@ -94,7 +94,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId); @@ -111,6 +111,11 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { bool updatePtNodeHasBigramsAndShortcutTargetsFlags(const PtNodeParams *const ptNodeParams); + // Suppress unigram not to use the word for generating suggestions. So, this method can be used + // only for dictionaries with historical info. Also, suppressed entries are included in unigram + // count. They will be removed from the dictionary during GC. 
+ bool suppressUnigramEntry(const PtNodeParams *const ptNodeParams); + private: DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeWriter); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp index 1296b8acd..6fb9cffb7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.cpp @@ -19,24 +19,25 @@ * Do not edit this file other than updating policy's interface. * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp + * dictionary/structure/v4/ver4_patricia_trie_policy.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" #include <vector> #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/dictionary/property/bigram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/session/prev_words_info.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/word_property.h" +#include 
"dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" namespace latinime { namespace backward { @@ -51,6 +52,7 @@ const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_C const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; +const int Ver4PatriciaTriePolicy::DUMMY_PROBABILITY_FOR_VALID_WORDS = 1; void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { @@ -76,12 +78,9 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d // Skip PtNodes that represent non-word information. continue; } - childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), - ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, - ptNodeParams.hasChildren(), - ptNodeParams.isBlacklisted() - || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, - ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); + const int wordId = isTerminal ? 
ptNodeParams.getHeadPos() : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), + wordId, ptNodeParams.getCodePointArrayView()); } if (readingHelper.isError()) { mIsCorrupted = true; @@ -89,13 +88,13 @@ void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const d } } -int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const { +int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); readingHelper.initWithPtNodePos(ptNodePos); - const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( - maxCodePointCount, outCodePoints, outUnigramProbability); + const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( + maxCodePointCount, outCodePoints); if (readingHelper.isError()) { mIsCorrupted = true; AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); @@ -103,72 +102,143 @@ int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( return codePointCount; } -int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const { +int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const int ptNodePos = - readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); + const int ptNodePos = 
readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); if (readingHelper.isError()) { mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + AKLOGE("Dictionary reading error in getWordId()."); + } + return getWordIdFromTerminalPtNodePos(ptNodePos); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); } - return ptNodePos; + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (multiBigramMap) { + const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); + } + if (!prevWordIds.empty()) { + const int probability = getProbabilityOfWord(prevWordIds, wordId); + if (probability != NOT_A_PROBABILITY) { + return getWordAttributes(probability, ptNodeParams); + } + } + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes Ver4PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const { + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.getProbability() == 0); } int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, const int bigramProbability) const { - if (mHeaderPolicy->isDecayingDict()) { - // Both probabilities are encoded. Decode them and get probability. 
- return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); - } else { - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (bigramProbability == NOT_A_PROBABILITY) { - return ProbabilityUtils::backoff(unigramProbability); - } else { - return bigramProbability; - } + // In the v4 format, bigramProbability is a conditional probability. + const int bigramConditionalProbability = bigramProbability; + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; } + if (bigramConditionalProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } + return bigramConditionalProbability; } -int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos, - const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { +int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID) { return NOT_A_PROBABILITY; } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { + if (ptNodeParams.isDeleted() || ptNodeParams.isNotAWord()) { return NOT_A_PROBABILITY; } - if (prevWordsPtNodePos) { - const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == ptNodePos - && bigramsIt.getProbability() != NOT_A_PROBABILITY) { - return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); - } - } + if (prevWordIds.empty()) { + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + } + if (prevWordIds[0] == NOT_A_WORD_ID) { return NOT_A_PROBABILITY; } - return 
getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + const PtNodeParams prevWordPtNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); + if (prevWordPtNodeParams.isDeleted()) { + return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); + } + const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( + prevWordPtNodeParams.getTerminalId()); + BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == ptNodePos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { + const int bigramConditionalProbability = getBigramConditionalProbability( + prevWordPtNodeParams.getProbability(), + prevWordPtNodeParams.representsBeginningOfSentence(), + bigramsIt.getProbability()); + return getProbability(ptNodeParams.getProbability(), bigramConditionalProbability); + } + } + return NOT_A_PROBABILITY; } -void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos, +void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, NgramListener *const listener) const { - if (!prevWordsPtNodePos) { + if (prevWordIds.firstOrDefault(NOT_A_DICT_POS) == NOT_A_DICT_POS) { + return; + } + const PtNodeParams prevWordPtNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(prevWordIds[0]); + if (prevWordPtNodeParams.isDeleted()) { return; } - const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); + const int bigramsPosition = mBuffers->getBigramDictContent()->getBigramListHeadPos( + prevWordPtNodeParams.getTerminalId()); BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); while (bigramsIt.hasNext()) { bigramsIt.next(); - listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos()); + const int bigramConditionalProbability = getBigramConditionalProbability( + 
prevWordPtNodeParams.getProbability(), + prevWordPtNodeParams.representsBeginningOfSentence(), bigramsIt.getProbability()); + listener->onVisitEntry(bigramConditionalProbability, + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); } } +int Ver4PatriciaTriePolicy::getBigramConditionalProbability(const int prevWordUnigramProbability, + const bool isInBeginningOfSentenceContext, const int bigramProbability) const { + if (mHeaderPolicy->hasHistoricalInfoOfWords()) { + if (isInBeginningOfSentenceContext) { + return bigramProbability; + } + // Calculate conditional probability. + return std::min(MAX_PROBABILITY - prevWordUnigramProbability + bigramProbability, + MAX_PROBABILITY); + } else { + // bigramProbability is a conditional probability. + return bigramProbability; + } +} + +BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( + const int wordId) const { + const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); + return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); +} + int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; @@ -193,7 +263,7 @@ int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) cons ptNodeParams.getTerminalId()); } -bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, +bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); @@ -204,13 +274,14 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le mDictBuffer->getTailPosition()); return false; } - if (length > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert to the dictionary, length: %d", length); + if (wordCodePoints.size() > MAX_WORD_LENGTH) { 
+ AKLOGE("The word is too long to insert to the dictionary, length: %zd", + wordCodePoints.size()); return false; } for (const auto &shortcut : unigramProperty->getShortcuts()) { if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", shortcut.getTargetCodePoints()->size()); return false; } @@ -219,8 +290,8 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le readingHelper.initWithPtNodeArrayPos(getRootPosition()); bool addedNewUnigram = false; int codePointsToAdd[MAX_WORD_LENGTH]; - int codePointCountToAdd = length; - memmove(codePointsToAdd, word, sizeof(int) * length); + int codePointCountToAdd = wordCodePoints.size(); + memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); if (unigramProperty->representsBeginningOfSentence()) { codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, codePointCountToAdd, MAX_WORD_LENGTH); @@ -228,24 +299,25 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le if (codePointCountToAdd <= 0) { return false; } - if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, - unigramProperty, &addedNewUnigram)) { + const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, + &addedNewUnigram)) { if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { - mUnigramCount++; + mEntryCounters.incrementNgramCount(NgramType::Unigram); } if (unigramProperty->getShortcuts().size() > 0) { // Add shortcut target. 
- const int wordPos = getTerminalPtNodePositionOfWord(word, length, - false /* forceLowerCaseSearch */); + const int wordPos = getTerminalPtNodePosFromWordId( + getWordId(codePointArrayView, false /* forceLowerCaseSearch */)); if (wordPos == NOT_A_DICT_POS) { AKLOGE("Cannot find terminal PtNode position to add shortcut target."); return false; } for (const auto &shortcut : unigramProperty->getShortcuts()) { if (!mUpdatingHelper.addShortcutTarget(wordPos, - shortcut.getTargetCodePoints()->data(), - shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { - AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %d, " + CodePointArrayView(*shortcut.getTargetCodePoints()), + shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), shortcut.getProbability()); return false; @@ -258,8 +330,21 @@ bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int le } } -bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty) { +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int ptNodePos = getTerminalPtNodePosFromWordId( + getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); + if (ptNodePos == NOT_A_DICT_POS) { + return false; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + return mNodeWriter.suppressUnigramEntry(&ptNodeParams); +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); return false; @@ -269,50 +354,50 @@ bool 
Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI mDictBuffer->getTailPosition()); return false; } - if (!prevWordsInfo->isValid()) { - AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); + const NgramContext *const ngramContext = ngramProperty->getNgramContext(); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); return false; } - if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { AKLOGE("The word is too long to insert the ngram to the dictionary. " - "length: %d", bigramProperty->getTargetCodePoints()->size()); + "length: %zd", ngramProperty->getTargetCodePoints()->size()); return false; } - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); - // TODO: Support N-gram. 
- if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { - if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { - const std::vector<UnigramProperty::ShortcutProperty> shortcuts; + if (prevWordIds.empty()) { + return false; + } + if (prevWordIds[0] == NOT_A_WORD_ID) { + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { const UnigramProperty beginningOfSentenceUnigramProperty( true /* representsBeginningOfSentence */, true /* isNotAWord */, - false /* isBlacklisted */, MAX_PROBABILITY /* probability */, - NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); - if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), - prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), + false /* isBlacklisted */, MAX_PROBABILITY /* probability */, HistoricalInfo()); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), &beginningOfSentenceUnigramProperty)) { AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); return false; } - // Refresh Terminal PtNode positions. - prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, - false /* tryLowerCaseSearch */); + // Refresh word ids. 
+ ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); } else { return false; } } - const int word1Pos = getTerminalPtNodePositionOfWord( - bigramProperty->getTargetCodePoints()->data(), - bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); - if (word1Pos == NOT_A_DICT_POS) { + const int wordPos = getTerminalPtNodePosFromWordId(getWordId( + CodePointArrayView(*ngramProperty->getTargetCodePoints()), + false /* forceLowerCaseSearch */)); + if (wordPos == NOT_A_DICT_POS) { return false; } bool addedNewBigram = false; - if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::fromObject(prevWordsPtNodePos), - word1Pos, bigramProperty, &addedNewBigram)) { + const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); + if (mUpdatingHelper.addNgramEntry(PtNodePosArrayView::singleElementView(&prevWordPtNodePos), + wordPos, ngramProperty, &addedNewBigram)) { if (addedNewBigram) { - mBigramCount++; + mEntryCounters.incrementNgramCount(NgramType::Bigram); } return true; } else { @@ -320,8 +405,8 @@ bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsI } } -bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const int *const word, const int length) { +bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); return false; @@ -331,40 +416,68 @@ bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWor mDictBuffer->getTailPosition()); return false; } - if (!prevWordsInfo->isValid()) { - AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); return false; } - if (length > 
MAX_WORD_LENGTH) { - AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", + wordCodePoints.size()); } - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSerch */); - // TODO: Support N-gram. - if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { return false; } - const int wordPos = getTerminalPtNodePositionOfWord(word, length, - false /* forceLowerCaseSearch */); + const int wordPos = getTerminalPtNodePosFromWordId(getWordId(wordCodePoints, + false /* forceLowerCaseSearch */)); if (wordPos == NOT_A_DICT_POS) { return false; } + const int prevWordPtNodePos = getTerminalPtNodePosFromWordId(prevWordIds[0]); if (mUpdatingHelper.removeNgramEntry( - PtNodePosArrayView::fromObject(prevWordsPtNodePos), wordPos)) { - mBigramCount--; + PtNodePosArrayView::singleElementView(&prevWordPtNodePos), wordPos)) { + mEntryCounters.decrementNgramCount(NgramType::Bigram); return true; } else { return false; } } + +bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( + const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, + const bool isValidWord, const HistoricalInfo historicalInfo) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + const int probability = isValidWord ? 
DUMMY_PROBABILITY_FOR_VALID_WORDS : NOT_A_PROBABILITY; + const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + false /* isNotAWord */, false /*isBlacklisted*/, probability, historicalInfo); + if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { + AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + const int probabilityForNgram = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) + ? NOT_A_PROBABILITY : probability; + const NgramProperty ngramProperty(*ngramContext, wordCodePoints.toVector(), probabilityForNgram, + historicalInfo); + if (!addNgramEntry(&ngramProperty)) { + AKLOGE("Cannot update unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + return true; +} + bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { if (!mBuffers->isUpdatable()) { AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); return false; } - if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { AKLOGE("Cannot flush the dictionary to file."); mIsCorrupted = true; return false; @@ -402,7 +515,7 @@ bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { // Needs to reduce dictionary size. 
return true; } else if (mHeaderPolicy->isDecayingDict()) { - return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), mHeaderPolicy); } return false; @@ -412,41 +525,42 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer char *const outResult, const int maxResultLength) { const int compareLength = queryLength + 1 /* terminator */; if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mUnigramCount); + snprintf(outResult, maxResultLength, "%d", + mEntryCounters.getNgramCount(NgramType::Unigram)); } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mBigramCount); + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getUnigramCountHardLimit( - mHeaderPolicy->getMaxUnigramCount()) : + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Unigram)) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { snprintf(outResult, maxResultLength, "%d", mHeaderPolicy->isDecayingDict() ? 
- ForgettingCurveUtils::getBigramCountHardLimit( - mHeaderPolicy->getMaxBigramCount()) : + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Bigram)) : static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); } } -const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, - const int codePointCount) const { - const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, - false /* forceLowerCaseSearch */); +const WordProperty Ver4PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int ptNodePos = getTerminalPtNodePosFromWordId( + getWordId(wordCodePoints, false /* forceLowerCaseSearch */)); if (ptNodePos == NOT_A_DICT_POS) { AKLOGE("getWordProperty is called for invalid word."); return WordProperty(); } const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - std::vector<int> codePointVector(ptNodeParams.getCodePoints(), - ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); const ProbabilityEntry probabilityEntry = mBuffers->getProbabilityDictContent()->getProbabilityEntry( ptNodeParams.getTerminalId()); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); // Fetch bigram information. 
- std::vector<BigramProperty> bigrams; + std::vector<NgramProperty> ngrams; const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); if (bigramListPos != NOT_A_DICT_POS) { int bigramWord1CodePoints[MAX_WORD_LENGTH]; @@ -465,21 +579,21 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code if (word1TerminalPtNodePos == NOT_A_DICT_POS) { continue; } - // Word (unigram) probability - int word1Probability = NOT_A_PROBABILITY; - const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( - word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, - &word1Probability); - const std::vector<int> word1(bigramWord1CodePoints, - bigramWord1CodePoints + codePointCount); + const int codePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(word1TerminalPtNodePos), MAX_WORD_LENGTH, + bigramWord1CodePoints); const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); - const int probability = bigramEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability( - bigramEntry.getHistoricalInfo(), mHeaderPolicy) : - bigramEntry.getProbability(); - bigrams.emplace_back(&word1, probability, - historicalInfo->getTimeStamp(), historicalInfo->getLevel(), - historicalInfo->getCount()); + const int rawBigramProbability = bigramEntry.hasHistoricalInfo() + ? ForgettingCurveUtils::decodeProbability( + bigramEntry.getHistoricalInfo(), mHeaderPolicy) + : bigramEntry.getProbability(); + const int probability = getBigramConditionalProbability(ptNodeParams.getProbability(), + ptNodeParams.representsBeginningOfSentence(), rawBigramProbability); + ngrams.emplace_back( + NgramContext(wordCodePoints.data(), wordCodePoints.size(), + ptNodeParams.representsBeginningOfSentence()), + CodePointArrayView(bigramWord1CodePoints, codePointCount).toVector(), + probability, *historicalInfo); } } // Fetch shortcut information. 
@@ -495,15 +609,15 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const code int shortcutProbability = NOT_A_PROBABILITY; shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); - const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); - shortcuts.emplace_back(&target, shortcutProbability); + shortcuts.emplace_back( + CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), + shortcutProbability); } } const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), - ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), - historicalInfo->getTimeStamp(), historicalInfo->getLevel(), - historicalInfo->getCount(), &shortcuts); - return WordProperty(&codePointVector, &unigramProperty, &bigrams); + ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), + ptNodeParams.getProbability(), *historicalInfo, std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); } int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, @@ -524,9 +638,8 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const return 0; } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; - int unigramProbability = NOT_A_PROBABILITY; - *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( - terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); + *outCodePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. 
@@ -536,6 +649,14 @@ int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const return nextToken; } +int Ver4PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { + return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; +} + +int Ver4PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { + return wordId == NOT_A_WORD_ID ? NOT_A_DICT_POS : wordId; +} + } // namespace v402 } // namespace backward } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h index 9e989b268..bce5f6bea 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h @@ -19,7 +19,7 @@ * Do not edit this file other than updating policy's interface. * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h + * dictionary/structure/v4/ver4_patricia_trie_policy.h */ #ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_POLICY_H @@ -28,17 +28,21 @@ #include <vector> #include "defines.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" 
-#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" +#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/binary_dictionary_shortcut_iterator.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "utils/int_array_view.h" namespace latinime { namespace backward { @@ -55,6 +59,8 @@ class DicNodeVector; namespace backward { namespace v402 { +// Word id = Position of a PtNode that represents the word. +// Max supported n-gram is bigram. 
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) @@ -70,54 +76,50 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), - mUnigramCount(mHeaderPolicy->getUnigramCount()), - mBigramCount(mHeaderPolicy->getBigramCount()), + mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; - AK_FORCE_INLINE int getRootPosition() const { + virtual int getRootPosition() const { return 0; } void createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const; - int getCodePointsAndProbabilityAndReturnCodePointCount( - const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const; + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; - int getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const; + int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; + + const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; int getProbability(const int unigramProbability, const int bigramProbability) const; - int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, const int ptNodePos) const; + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; - void iterateNgramEntries(const int *const prevWordsPtNodePos, + void iterateNgramEntries(const WordIdArrayView prevWordIds, NgramListener *const listener) const; - int getShortcutPositionOfPtNode(const int 
ptNodePos) const; + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { return mHeaderPolicy; } - const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { - return &mShortcutPolicy; - } - - bool addUnigramEntry(const int *const word, const int length, + bool addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty); - bool removeUnigramEntry(const int *const word, const int length) { - // Removing unigram entry is not supported. - return false; - } + bool removeUnigramEntry(const CodePointArrayView wordCodePoints); + + bool addNgramEntry(const NgramProperty *const ngramProperty); - bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty); + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints); - bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1, - const int length1); + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo); bool flush(const char *const filePath); @@ -128,8 +130,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength); - const WordProperty getWordProperty(const int *const codePoints, - const int codePointCount) const; + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; int getNextWordAndNextToken(const int token, int *const outCodePoints, int *const outCodePointCount); @@ -149,6 +150,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { // prevent the dictionary from overflowing. 
static const int MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; static const int MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS; + static const int DUMMY_PROBABILITY_FOR_VALID_WORDS; const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; const HeaderPolicy *const mHeaderPolicy; @@ -160,12 +162,18 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { Ver4PatriciaTrieNodeWriter mNodeWriter; DynamicPtUpdatingHelper mUpdatingHelper; Ver4PatriciaTrieWritingHelper mWritingHelper; - int mUnigramCount; - int mBigramCount; + MutableEntryCounters mEntryCounters; std::vector<int> mTerminalPtNodePositionsForIteratingWords; mutable bool mIsCorrupted; int getBigramsPositionOfPtNode(const int ptNodePos) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; + int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; + int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const; + int getBigramConditionalProbability(const int prevWordUnigramProbability, + const bool isInBeginningOfSentenceContext, const int bigramProbability) const; }; } // namespace v402 } // namespace backward diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp index 80d531198..b8a4cf847 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.cpp @@ -18,12 +18,12 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp + * dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h index 3579c26d6..c3e736bdc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_reading_utils.h @@ -18,7 +18,7 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h + * dictionary/structure/v4/ver4_patricia_trie_reading_utils.h */ #ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_READING_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp index 3fb4caa08..c0af9eae6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.cpp @@ -18,43 +18,43 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp + * dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h" #include <cstring> #include <queue> -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/bigram/ver4_bigram_list_policy.h" +#include "dictionary/structure/backward/v402/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" 
+#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/forgetting_curve_utils.h" namespace latinime { namespace backward { namespace v402 { bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, - const int unigramCount, const int bigramCount) const { + const EntryCounts &entryCounts) const { const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, - unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { + entryCounts, extendedRegionSize, &headerBuffer)) { AKLOGE("Cannot write header structure to buffer. " "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " - "extendedRegionSize: %d", false, unigramCount, bigramCount, - extendedRegionSize); + "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), + entryCounts.getNgramCount(NgramType::Bigram), extendedRegionSize); return false; } return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); @@ -73,8 +73,11 @@ bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr } BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + MutableEntryCounters entryCounters; + entryCounters.setNgramCount(NgramType::Unigram, unigramCount); + entryCounters.setNgramCount(NgramType::Bigram, bigramCount); if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { + entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { return false; } return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); @@ -106,7 
+109,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, } const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted .getValidUnigramCount(); - const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); + const int maxUnigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Unigram); if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, @@ -123,7 +126,7 @@ bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, return false; } const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); - const int maxBigramCount = headerPolicy->getMaxBigramCount(); + const int maxBigramCount = headerPolicy->getMaxNgramCounts().getNgramCount(NgramType::Bigram); if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { if (!truncateBigrams(maxBigramCount)) { AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); @@ -216,7 +219,7 @@ bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : probabilityEntry.getProbability(); priorityQueue.push(DictProbability(terminalPos, probability, - probabilityEntry.getHistoricalInfo()->getTimeStamp())); + probabilityEntry.getHistoricalInfo()->getTimestamp())); } // Delete unigrams. 
@@ -263,7 +266,7 @@ bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : bigramEntry.getProbability(); priorityQueue.push(DictProbability(entryPos, probability, - bigramEntry.getHistoricalInfo()->getTimeStamp())); + bigramEntry.getHistoricalInfo()->getTimestamp())); } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h index 9034ee656..f2b873826 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_patricia_trie_writing_helper.h @@ -18,15 +18,16 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h + * dictionary/structure/v4/ver4_patricia_trie_writing_helper.h */ #ifndef LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H #define LATINIME_BACKWARD_V402_VER4_PATRICIA_TRIE_WRITING_HELPER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "dictionary/structure/backward/v402/content/terminal_position_lookup_table.h" +#include "dictionary/utils/entry_counters.h" namespace latinime { namespace backward { @@ -46,8 +47,7 @@ class Ver4PatriciaTrieWritingHelper { Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) : mBuffers(buffers) {} - bool writeToDictFile(const char *const dictDirPath, const int unigramCount, - const int bigramCount) const; + bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; // 
This method cannot be const because the original dictionary buffer will be updated to detect // useless PtNodes during GC. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp index 537a6d420..d27d70816 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.cpp @@ -18,14 +18,14 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! * * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp + * dictionary/structure/v4/ver4_pt_node_array_reader.cpp */ -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" +#include "dictionary/structure/backward/v402/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h index 4f8056801..0039bf8fc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h +++ b/native/jni/src/dictionary/structure/backward/v402/ver4_pt_node_array_reader.h @@ -18,14 +18,14 @@ * !!!!! DO NOT EDIT THIS FILE !!!!! 
* * This file was generated from - * suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h + * dictionary/structure/v4/ver4_pt_node_array_reader.h */ #ifndef LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H #define LATINIME_BACKWARD_V402_VER4_PT_NODE_ARRAY_READER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" namespace latinime { namespace backward { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp index e4ea3da16..4470e8568 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp +++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.cpp @@ -14,23 +14,23 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" +#include "dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" #include <climits> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/backward/v402/ver4_dict_constants.h" +#include "dictionary/structure/backward/v402/ver4_patricia_trie_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/v2/patricia_trie_policy.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/ver4_patricia_trie_policy.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" #include "utils/byte_array_view.h" namespace latinime { @@ -58,7 +58,7 @@ namespace latinime { const 
DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { FormatUtils::FORMAT_VERSION dictFormatVersion = FormatUtils::getFormatVersion(formatVersion); switch (dictFormatVersion) { - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_402: { return newPolicyForOnMemoryV4Dict<backward::v402::Ver4DictConstants, backward::v402::Ver4DictBuffers, backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr, @@ -66,7 +66,7 @@ namespace latinime { dictFormatVersion, locale, attributeMap); } case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4_DEV: { + case FormatUtils::VERSION_403: { return newPolicyForOnMemoryV4Dict<Ver4DictConstants, Ver4DictBuffers, Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>( dictFormatVersion, locale, attributeMap); @@ -111,13 +111,14 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str return nullptr; } const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::detectFormatVersion( - mmappedBuffer->getReadOnlyByteArrayView().data(), - mmappedBuffer->getReadOnlyByteArrayView().size()); + mmappedBuffer->getReadOnlyByteArrayView()); switch (formatVersion) { case FormatUtils::VERSION_2: - AKLOGE("Given path is a directory but the format is version 2. path: %s", path); + case FormatUtils::VERSION_201: + case FormatUtils::VERSION_202: + AKLOGE("Given path is a directory but the format is version 2xx. 
path: %s", path); break; - case FormatUtils::VERSION_4: { + case FormatUtils::VERSION_402: { return newPolicyForV4Dict<backward::v402::Ver4DictConstants, backward::v402::Ver4DictBuffers, backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr, @@ -125,7 +126,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str headerFilePath, formatVersion, std::move(mmappedBuffer)); } case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4_DEV: { + case FormatUtils::VERSION_403: { return newPolicyForV4Dict<Ver4DictConstants, Ver4DictBuffers, Ver4DictBuffers::Ver4DictBuffersPtr, Ver4PatriciaTriePolicy>( headerFilePath, formatVersion, std::move(mmappedBuffer)); @@ -174,14 +175,17 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr, class Str if (!mmappedBuffer) { return nullptr; } - switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView().data(), - mmappedBuffer->getReadOnlyByteArrayView().size())) { + switch (FormatUtils::detectFormatVersion(mmappedBuffer->getReadOnlyByteArrayView())) { case FormatUtils::VERSION_2: + case FormatUtils::VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + break; + case FormatUtils::VERSION_202: return DictionaryStructureWithBufferPolicy::StructurePolicyPtr( new PatriciaTriePolicy(std::move(mmappedBuffer))); case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4: - case FormatUtils::VERSION_4_DEV: + case FormatUtils::VERSION_402: + case FormatUtils::VERSION_403: AKLOGE("Given path is a file but the format is version 4. 
path: %s", path); break; default: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h index 768454d8d..b0c04c0b1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h +++ b/native/jni/src/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h @@ -20,10 +20,10 @@ #include <vector> #include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp index f7fd5c071..64f9b6663 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp +++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.cpp @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -39,32 +39,31 @@ const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; /* static */ bool BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( - const uint8_t *const bigramsBuf, const int bufSize, BigramFlags *const outBigramFlags, + const ReadOnlyByteArrayView buffer, BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, int *const bigramEntryPos) { - if (bufSize <= *bigramEntryPos) { - AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). bufSize: %d, " - "bigramEntryPos: %d.", bufSize, *bigramEntryPos); + if (static_cast<int>(buffer.size()) <= *bigramEntryPos) { + AKLOGE("Read invalid pos in getBigramEntryPropertiesAndAdvancePosition(). 
bufSize: %zd, " + "bigramEntryPos: %d.", buffer.size(), *bigramEntryPos); return false; } - const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, + const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), bigramEntryPos); if (outBigramFlags) { *outBigramFlags = bigramFlags; } - const int targetPos = getBigramAddressAndAdvancePosition(bigramsBuf, bigramFlags, - bigramEntryPos); + const int targetPos = getBigramAddressAndAdvancePosition(buffer, bigramFlags, bigramEntryPos); if (outTargetPtNodePos) { *outTargetPtNodePos = targetPos; } return true; } -/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const uint8_t *const bigramsBuf, - const int bufSize, int *const bigramListPos) { +/* static */ bool BigramListReadWriteUtils::skipExistingBigrams(const ReadOnlyByteArrayView buffer, + int *const bigramListPos) { BigramFlags flags; do { - if (!getBigramEntryPropertiesAndAdvancePosition(bigramsBuf, bufSize, &flags, - 0 /* outTargetPtNodePos */, bigramListPos)) { + if (!getBigramEntryPropertiesAndAdvancePosition(buffer, &flags, 0 /* outTargetPtNodePos */, + bigramListPos)) { return false; } } while(hasNext(flags)); @@ -72,18 +71,18 @@ const BigramListReadWriteUtils::BigramFlags } /* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition( - const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) { + const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos) { int offset = 0; const int origin = *pos; switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: - offset = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos); + offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); break; case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: - offset = ByteArrayUtils::readUint16AndAdvancePosition(bigramsBuf, pos); + offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos); break; 
case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: - offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); + offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer.data(), pos); break; } if (isOffsetNegative(flags)) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h index 10f93fb7a..a0f7d5e83 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h +++ b/native/jni/src/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h @@ -21,6 +21,7 @@ #include <cstdlib> #include "defines.h" +#include "utils/byte_array_view.h" namespace latinime { @@ -30,8 +31,8 @@ class BigramListReadWriteUtils { public: typedef uint8_t BigramFlags; - static bool getBigramEntryPropertiesAndAdvancePosition(const uint8_t *const bigramsBuf, - const int bufSize, BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, + static bool getBigramEntryPropertiesAndAdvancePosition(const ReadOnlyByteArrayView buffer, + BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, int *const bigramEntryPos); static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { @@ -43,8 +44,7 @@ public: } // Bigrams reading methods - static bool skipExistingBigrams(const uint8_t *const bigramsBuf, const int bufSize, - int *const bigramListPos); + static bool skipExistingBigrams(const ReadOnlyByteArrayView buffer, int *const bigramListPos); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); @@ -61,7 +61,7 @@ private: return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; } - static int getBigramAddressAndAdvancePosition(const uint8_t *const bigramsBuf, + static int getBigramAddressAndAdvancePosition(const ReadOnlyByteArrayView buffer, const BigramFlags flags, int *const pos); }; } // namespace latinime diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp index db1a802d0..b5e2e9dae 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h index 2aa402748..8c7ad965b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h @@ -20,9 +20,9 @@ #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include 
"dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -76,6 +76,7 @@ class DynamicPtGcEventListeners { int mValidUnigramCount; }; + // TODO: Remove when we stop supporting v402 format. // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram // entries. class TraversePolicyToUpdateBigramProbability diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp index 086d98b4a..294bc6ea9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.cpp @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" #include "utils/char_utils.h" namespace latinime { @@ -175,8 +175,8 @@ bool DynamicPtReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFi return !isError(); } -int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount( - const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) { +int DynamicPtReadingHelper::getCodePointsAndReturnCodePointCount(const int maxCodePointCount, + int *const outCodePoints) { // This method traverses parent nodes from the terminal by following parent pointers; thus, // node code points are stored in the buffer in the reverse order. 
int reverseCodePoints[maxCodePointCount]; @@ -184,11 +184,8 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount( // First, read the terminal node and get its probability. if (!isValidTerminalNode(terminalPtNodeParams)) { // Node at the ptNodePos is not a valid terminal node. - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } - // Store terminal node probability. - *outUnigramProbability = terminalPtNodeParams.getProbability(); // Then, following parent node link to the dictionary root and fetch node code points. int totalCodePointCount = 0; while (!isEnd()) { @@ -196,7 +193,6 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount( totalCodePointCount = getTotalCodePointCount(ptNodeParams); if (!ptNodeParams.isValid() || totalCodePointCount > maxCodePointCount) { // The ptNodePos is not a valid terminal node position in the dictionary. - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Store node code points to buffer in the reverse order. @@ -207,7 +203,6 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount( } if (isError()) { // The node position or the dictionary is invalid. - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } // Reverse the stored code points to output them. @@ -218,9 +213,9 @@ int DynamicPtReadingHelper::getCodePointsAndProbabilityAndReturnCodePointCount( } int DynamicPtReadingHelper::getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) { + const size_t length, const bool forceLowerCaseSearch) { int searchCodePoints[length]; - for (int i = 0; i < length; ++i) { + for (size_t i = 0; i < length; ++i) { searchCodePoints[i] = forceLowerCaseSearch ? 
CharUtils::toLowerCase(inWord[i]) : inWord[i]; } while (!isEnd()) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h index b7262581a..d8ddc7c2b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_helper.h @@ -21,8 +21,8 @@ #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" namespace latinime { @@ -138,12 +138,12 @@ class DynamicPtReadingHelper { } // Return code point count exclude the last read node's code points. - AK_FORCE_INLINE int getPrevTotalCodePointCount() const { + AK_FORCE_INLINE size_t getPrevTotalCodePointCount() const { return mReadingState.mTotalCodePointCountSinceInitialization; } // Return code point count include the last read node's code points. 
- AK_FORCE_INLINE int getTotalCodePointCount(const PtNodeParams &ptNodeParams) const { + AK_FORCE_INLINE size_t getTotalCodePointCount(const PtNodeParams &ptNodeParams) const { return mReadingState.mTotalCodePointCountSinceInitialization + ptNodeParams.getCodePointCount(); } @@ -211,10 +211,9 @@ class DynamicPtReadingHelper { bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( TraversingEventListener *const listener); - int getCodePointsAndProbabilityAndReturnCodePointCount(const int maxCodePointCount, - int *const outCodePoints, int *const outUnigramProbability); + int getCodePointsAndReturnCodePointCount(const int maxCodePointCount, int *const outCodePoints); - int getTerminalPtNodePositionOfWord(const int *const inWord, const int length, + int getTerminalPtNodePositionOfWord(const int *const inWord, const size_t length, const bool forceLowerCaseSearch); private: @@ -234,7 +233,7 @@ class DynamicPtReadingHelper { int mPos; // Remaining node count in the current array. int mRemainingPtNodeCountInThisArray; - int mTotalCodePointCountSinceInitialization; + size_t mTotalCodePointCountSinceInitialization; // Counter of PtNodes used to avoid infinite loops caused by broken or malicious links. int mTotalPtNodeIndexInThisArrayChain; // Counter of PtNode arrays used to avoid infinite loops caused by cyclic links of empty diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp index 3586b50ab..3eb55ed9b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.cpp @@ -14,10 +14,10 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" #include "defines.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h index b13a075d5..b13a075d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_reading_utils.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp index 3c62e2e56..ccad345c8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.cpp @@ -14,31 +14,30 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { const int DynamicPtUpdatingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; -bool DynamicPtUpdatingHelper::addUnigramWord( - DynamicPtReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, - const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram) { +bool DynamicPtUpdatingHelper::addUnigramWord(DynamicPtReadingHelper *const readingHelper, + const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, + bool *const outAddedNewUnigram) { int parentPos = NOT_A_DICT_POS; while (!readingHelper->isEnd()) { const PtNodeParams ptNodeParams(readingHelper->getPtNodeParams()); if (!ptNodeParams.isValid()) { break; } - const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + const size_t 
matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); if (!readingHelper->isMatchedCodePoint(ptNodeParams, 0 /* index */, wordCodePoints[matchedCodePointCount])) { // The first code point is different from target code point. Skip this node and read @@ -47,26 +46,25 @@ bool DynamicPtUpdatingHelper::addUnigramWord( continue; } // Check following merged node code points. - const int nodeCodePointCount = ptNodeParams.getCodePointCount(); - for (int j = 1; j < nodeCodePointCount; ++j) { - const int nextIndex = matchedCodePointCount + j; - if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(ptNodeParams, j, - wordCodePoints[matchedCodePointCount + j])) { + const size_t nodeCodePointCount = ptNodeParams.getCodePointArrayView().size(); + for (size_t j = 1; j < nodeCodePointCount; ++j) { + const size_t nextIndex = matchedCodePointCount + j; + if (nextIndex >= wordCodePoints.size() + || !readingHelper->isMatchedCodePoint(ptNodeParams, j, + wordCodePoints[matchedCodePointCount + j])) { *outAddedNewUnigram = true; return reallocatePtNodeAndAddNewPtNodes(&ptNodeParams, j, unigramProperty, - wordCodePoints + matchedCodePointCount, - codePointCount - matchedCodePointCount); + wordCodePoints.skip(matchedCodePointCount)); } } // All characters are matched. - if (codePointCount == readingHelper->getTotalCodePointCount(ptNodeParams)) { + if (wordCodePoints.size() == readingHelper->getTotalCodePointCount(ptNodeParams)) { return setPtNodeProbability(&ptNodeParams, unigramProperty, outAddedNewUnigram); } if (!ptNodeParams.hasChildren()) { *outAddedNewUnigram = true; return createChildrenPtNodeArrayAndAChildPtNode(&ptNodeParams, unigramProperty, - wordCodePoints + readingHelper->getTotalCodePointCount(ptNodeParams), - codePointCount - readingHelper->getTotalCodePointCount(ptNodeParams)); + wordCodePoints.skip(readingHelper->getTotalCodePointCount(ptNodeParams))); } // Advance to the children nodes. 
parentPos = ptNodeParams.getHeadPos(); @@ -79,13 +77,12 @@ bool DynamicPtUpdatingHelper::addUnigramWord( int pos = readingHelper->getPosOfLastForwardLinkField(); *outAddedNewUnigram = true; return createAndInsertNodeIntoPtNodeArray(parentPos, - wordCodePoints + readingHelper->getPrevTotalCodePointCount(), - codePointCount - readingHelper->getPrevTotalCodePointCount(), - unigramProperty, &pos); + wordCodePoints.skip(readingHelper->getPrevTotalCodePointCount()), unigramProperty, + &pos); } bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, - const int wordPos, const BigramProperty *const bigramProperty, + const int wordPos, const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) { if (prevWordsPtNodePos.empty()) { return false; @@ -99,7 +96,7 @@ bool DynamicPtUpdatingHelper::addNgramEntry(const PtNodePosArrayView prevWordsPt const WordIdArrayView prevWordIds(prevWordTerminalIds, prevWordsPtNodePos.size()); const int wordId = mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos).getTerminalId(); - return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, bigramProperty, outAddedNewEntry); + return mPtNodeWriter->addNgramEntry(prevWordIds, wordId, ngramProperty, outAddedNewEntry); } bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, @@ -120,23 +117,21 @@ bool DynamicPtUpdatingHelper::removeNgramEntry(const PtNodePosArrayView prevWord } bool DynamicPtUpdatingHelper::addShortcutTarget(const int wordPos, - const int *const targetCodePoints, const int targetCodePointCount, - const int shortcutProbability) { + const CodePointArrayView targetCodePoints, const int shortcutProbability) { const PtNodeParams ptNodeParams(mPtNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(wordPos)); - return mPtNodeWriter->addShortcutTarget(&ptNodeParams, targetCodePoints, targetCodePointCount, - shortcutProbability); + return mPtNodeWriter->addShortcutTarget(&ptNodeParams, 
targetCodePoints.data(), + targetCodePoints.size(), shortcutProbability); } bool DynamicPtUpdatingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, - const int *const nodeCodePoints, const int nodeCodePointCount, - const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos) { + const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, + int *const forwardLinkFieldPos) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, newPtNodeArrayPos, forwardLinkFieldPos)) { return false; } - return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, - unigramProperty); + return createNewPtNodeArrayWithAChildPtNode(parentPos, ptNodeCodePoints, unigramProperty); } bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, @@ -151,10 +146,9 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const ori const int movedPos = mBuffer->getTailPosition(); int writingPos = movedPos; const PtNodeParams ptNodeParamsToWrite(getUpdatedPtNodeParams(originalPtNodeParams, - unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, originalPtNodeParams->getParentPos(), - originalPtNodeParams->getCodePointCount(), originalPtNodeParams->getCodePoints(), - unigramProperty->getProbability())); + originalPtNodeParams->getCodePointArrayView(), unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, unigramProperty, &writingPos)) { return false; @@ -168,17 +162,17 @@ bool DynamicPtUpdatingHelper::setPtNodeProbability(const PtNodeParams *const ori bool DynamicPtUpdatingHelper::createChildrenPtNodeArrayAndAChildPtNode( const PtNodeParams *const parentPtNodeParams, const UnigramProperty *const 
unigramProperty, - const int *const codePoints, const int codePointCount) { + const CodePointArrayView codePoints) { const int newPtNodeArrayPos = mBuffer->getTailPosition(); if (!mPtNodeWriter->updateChildrenPosition(parentPtNodeParams, newPtNodeArrayPos)) { return false; } return createNewPtNodeArrayWithAChildPtNode(parentPtNodeParams->getHeadPos(), codePoints, - codePointCount, unigramProperty); + unigramProperty); } bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( - const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, + const int parentPtNodePos, const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty) { int writingPos = mBuffer->getTailPosition(); if (!DynamicPtWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, @@ -186,8 +180,8 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( return false; } const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), true /* isTerminal */, - parentPtNodePos, nodeCodePointCount, nodeCodePoints, + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), + true /* isTerminal */, parentPtNodePos, ptNodeCodePoints, unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, unigramProperty, &writingPos)) { @@ -202,9 +196,9 @@ bool DynamicPtUpdatingHelper::createNewPtNodeArrayWithAChildPtNode( // Returns whether the dictionary updating was succeeded or not. 
bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( - const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, - const int newNodeCodePointCount) { + const PtNodeParams *const reallocatingPtNodeParams, const size_t overlappingCodePointCount, + const UnigramProperty *const unigramProperty, + const CodePointArrayView newPtNodeCodePoints) { // When addsExtraChild is true, split the reallocating PtNode and add new child. // Reallocating PtNode: abcde, newNode: abcxy. // abc (1st, not terminal) __ de (2nd) @@ -212,25 +206,26 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( // Otherwise, this method makes 1st part terminal and write information in unigramProperty. // Reallocating PtNode: abcde, newNode: abc. // abc (1st, terminal) __ de (2nd) - const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; + const bool addsExtraChild = newPtNodeCodePoints.size() > overlappingCodePointCount; const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); int writingPos = firstPartOfReallocatedPtNodePos; // Write the 1st part of the reallocating node. The children position will be updated later // with actual children position. 
+ const CodePointArrayView firstPtNodeCodePoints = + reallocatingPtNodeParams->getCodePointArrayView().limit(overlappingCodePointCount); if (addsExtraChild) { const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - false /* isNotAWord */, false /* isBlacklisted */, false /* isTerminal */, - reallocatingPtNodeParams->getParentPos(), overlappingCodePointCount, - reallocatingPtNodeParams->getCodePoints(), NOT_A_PROBABILITY)); + false /* isNotAWord */, false /* isPossiblyOffensive */, false /* isTerminal */, + reallocatingPtNodeParams->getParentPos(), firstPtNodeCodePoints, + NOT_A_PROBABILITY)); if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&ptNodeParamsToWrite, &writingPos)) { return false; } } else { const PtNodeParams ptNodeParamsToWrite(getPtNodeParamsForNewPtNode( - unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, reallocatingPtNodeParams->getParentPos(), - overlappingCodePointCount, reallocatingPtNodeParams->getCodePoints(), - unigramProperty->getProbability())); + firstPtNodeCodePoints, unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&ptNodeParamsToWrite, unigramProperty, &writingPos)) { return false; @@ -246,20 +241,19 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( // Write the 2nd part of the reallocating node. 
const int secondPartOfReallocatedPtNodePos = writingPos; const PtNodeParams childPartPtNodeParams(getUpdatedPtNodeParams(reallocatingPtNodeParams, - reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isBlacklisted(), + reallocatingPtNodeParams->isNotAWord(), reallocatingPtNodeParams->isPossiblyOffensive(), reallocatingPtNodeParams->isTerminal(), firstPartOfReallocatedPtNodePos, - reallocatingPtNodeParams->getCodePointCount() - overlappingCodePointCount, - reallocatingPtNodeParams->getCodePoints() + overlappingCodePointCount, + reallocatingPtNodeParams->getCodePointArrayView().skip(overlappingCodePointCount), reallocatingPtNodeParams->getProbability())); if (!mPtNodeWriter->writePtNodeAndAdvancePosition(&childPartPtNodeParams, &writingPos)) { return false; } if (addsExtraChild) { const PtNodeParams extraChildPtNodeParams(getPtNodeParamsForNewPtNode( - unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isNotAWord(), unigramProperty->isPossiblyOffensive(), true /* isTerminal */, firstPartOfReallocatedPtNodePos, - newNodeCodePointCount - overlappingCodePointCount, - newNodeCodePoints + overlappingCodePointCount, unigramProperty->getProbability())); + newPtNodeCodePoints.skip(overlappingCodePointCount), + unigramProperty->getProbability())); if (!mPtNodeWriter->writeNewTerminalPtNodeAndAdvancePosition(&extraChildPtNodeParams, unigramProperty, &writingPos)) { return false; @@ -282,26 +276,24 @@ bool DynamicPtUpdatingHelper::reallocatePtNodeAndAddNewPtNodes( } const PtNodeParams DynamicPtUpdatingHelper::getUpdatedPtNodeParams( - const PtNodeParams *const originalPtNodeParams, - const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, const int parentPos, - const int codePointCount, const int *const codePoints, const int probability) const { + const PtNodeParams *const originalPtNodeParams, const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const 
CodePointArrayView codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, - false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, + isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); - return PtNodeParams(originalPtNodeParams, flags, parentPos, codePointCount, codePoints, - probability); + return PtNodeParams(originalPtNodeParams, flags, parentPos, codePoints, probability); } -const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode( - const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, - const int parentPos, const int codePointCount, const int *const codePoints, - const int probability) const { +const PtNodeParams DynamicPtUpdatingHelper::getPtNodeParamsForNewPtNode(const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const { const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::createAndGetFlags( - isBlacklisted, isNotAWord, isTerminal, false /* hasShortcutTargets */, - false /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, + isPossiblyOffensive, isNotAWord, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, codePoints.size() > 1u /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); - return PtNodeParams(flags, parentPos, codePointCount, codePoints, probability); + return PtNodeParams(flags, parentPos, codePoints, probability); } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h index 97c05c1ea..e8cf98c39 100644 --- 
a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_updating_helper.h @@ -18,12 +18,12 @@ #define LATINIME_DYNAMIC_PT_UPDATING_HELPER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_params.h" #include "utils/int_array_view.h" namespace latinime { -class BigramProperty; +class NgramProperty; class BufferWithExtendableBuffer; class DynamicPtReadingHelper; class PtNodeReader; @@ -40,19 +40,21 @@ class DynamicPtUpdatingHelper { // Add a word to the dictionary. If the word already exists, update the probability. bool addUnigramWord(DynamicPtReadingHelper *const readingHelper, - const int *const wordCodePoints, const int codePointCount, - const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); + const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty, + bool *const outAddedNewUnigram); + // TODO: Remove after stopping supporting v402. // Add an n-gram entry. bool addNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); + // TODO: Remove after stopping supporting v402. // Remove an n-gram entry. bool removeNgramEntry(const PtNodePosArrayView prevWordsPtNodePos, const int wordPos); // Add a shortcut target. 
- bool addShortcutTarget(const int wordPos, const int *const targetCodePoints, - const int targetCodePointCount, const int shortcutProbability); + bool addShortcutTarget(const int wordPos, const CodePointArrayView targetCodePoints, + const int shortcutProbability); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPtUpdatingHelper); @@ -63,33 +65,32 @@ class DynamicPtUpdatingHelper { const PtNodeReader *const mPtNodeReader; PtNodeWriter *const mPtNodeWriter; - bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const UnigramProperty *const unigramProperty, + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, + const CodePointArrayView ptNodeCodePoints, const UnigramProperty *const unigramProperty, int *const forwardLinkFieldPos); bool setPtNodeProbability(const PtNodeParams *const originalPtNodeParams, const UnigramProperty *const unigramProperty, bool *const outAddedNewUnigram); bool createChildrenPtNodeArrayAndAChildPtNode(const PtNodeParams *const parentPtNodeParams, - const UnigramProperty *const unigramProperty, const int *const codePoints, - const int codePointCount); + const UnigramProperty *const unigramProperty, + const CodePointArrayView remainingCodePoints); - bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, - const int nodeCodePointCount, const UnigramProperty *const unigramProperty); + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, + const CodePointArrayView ptNodeCodePoints, + const UnigramProperty *const unigramProperty); - bool reallocatePtNodeAndAddNewPtNodes( - const PtNodeParams *const reallocatingPtNodeParams, const int overlappingCodePointCount, - const UnigramProperty *const unigramProperty, const int *const newNodeCodePoints, - const int newNodeCodePointCount); + bool reallocatePtNodeAndAddNewPtNodes(const PtNodeParams *const reallocatingPtNodeParams, + const size_t overlappingCodePointCount, const 
UnigramProperty *const unigramProperty, + const CodePointArrayView newPtNodeCodePoints); const PtNodeParams getUpdatedPtNodeParams(const PtNodeParams *const originalPtNodeParams, - const bool isNotAWord, const bool isBlacklisted, const bool isTerminal, - const int parentPos, const int codePointCount, - const int *const codePoints, const int probability) const; + const bool isNotAWord, const bool isPossiblyOffensive, const bool isTerminal, + const int parentPos, const CodePointArrayView codePoints, const int probability) const; - const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, const bool isBlacklisted, - const bool isTerminal, const int parentPos, - const int codePointCount, const int *const codePoints, const int probability) const; + const PtNodeParams getPtNodeParamsForNewPtNode(const bool isNotAWord, + const bool isPossiblyOffensive, const bool isTerminal, const int parentPos, + const CodePointArrayView codePoints, const int probability) const; }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_UPDATING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp index 664aeebbb..ea760a538 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.cpp @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" #include <cstddef> #include <cstdint> #include <cstdlib> -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h index 362fbd1cc..b4817af41 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h +++ b/native/jni/src/dictionary/structure/pt_common/dynamic_pt_writing_utils.h @@ -20,7 +20,7 @@ #include <cstddef> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp index e64a13cc4..e2807c492 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp +++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.cpp @@ -14,12 +14,12 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" #include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { @@ -41,8 +41,8 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08 const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; // Flag for non-words (typically, shortcut only entries) const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; -// Flag for blacklist -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; +// Flag for possibly offensive words +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_POSSIBLY_OFFENSIVE = 0x01; /* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( const uint8_t *const buffer, int *const pos) { @@ -61,19 +61,20 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; } /* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, - int *const pos) { - return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos); + const int *const codePointTable, int *const pos) { + return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, codePointTable, pos); } // Returns the number of read characters. 
/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, - const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) { + const NodeFlags flags, const int maxLength, const int *const codePointTable, + int *const outBuffer, int *const pos) { int length = 0; if (hasMultipleChars(flags)) { - length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, - pos); + length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, codePointTable, + outBuffer, pos); } else { - const int codePoint = getCodePointAndAdvancePosition(buffer, pos); + const int codePoint = getCodePointAndAdvancePosition(buffer, codePointTable, pos); if (codePoint == NOT_A_CODE_POINT) { // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR @@ -92,12 +93,12 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; // Returns the number of skipped characters. 
/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, int *const pos) { + const int maxLength, const int *const codePointTable, int *const pos) { if (hasMultipleChars(flags)) { return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); } else { if (maxLength > 0) { - getCodePointAndAdvancePosition(buffer, pos); + getCodePointAndAdvancePosition(buffer, codePointTable, pos); return 1; } else { return 0; @@ -134,7 +135,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; /* static */ void PtReadingUtils::readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, - const DictionaryBigramsStructurePolicy *const bigramPolicy, + const DictionaryBigramsStructurePolicy *const bigramPolicy, const int *const codePointTable, NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos, int *const outSiblingPos) { @@ -142,7 +143,7 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; const NodeFlags flags = getFlagsAndAdvancePosition(dictBuf, &readingPos); *outFlags = flags; *outCodePointCount = getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, outCodePoint, &readingPos); + dictBuf, flags, MAX_WORD_LENGTH, codePointTable, outCodePoint, &readingPos); *outProbability = isTerminal(flags) ? readProbabilityAndAdvancePosition(dictBuf, &readingPos) : NOT_A_PROBABILITY; *outChildrenPos = hasChildrenInFlags(flags) ? 
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h index c3f09c3b1..6a2bf5d3c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h +++ b/native/jni/src/dictionary/structure/pt_common/patricia_trie_reading_utils.h @@ -34,15 +34,17 @@ class PatriciaTrieReadingUtils { static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos); - static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos); + static int getCodePointAndAdvancePosition(const uint8_t *const buffer, + const int *const codePointTable, int *const pos); // Returns the number of read characters. static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, int *const outBuffer, int *const pos); + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos); // Returns the number of skipped characters. 
static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags, - const int maxLength, int *const pos); + const int maxLength, const int *const codePointTable, int *const pos); static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos); @@ -52,8 +54,8 @@ class PatriciaTrieReadingUtils { /** * Node Flags */ - static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) { - return (flags & FLAG_IS_BLACKLISTED) != 0; + static AK_FORCE_INLINE bool isPossiblyOffensive(const NodeFlags flags) { + return (flags & FLAG_IS_POSSIBLY_OFFENSIVE) != 0; } static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) { @@ -80,12 +82,12 @@ class PatriciaTrieReadingUtils { return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags); } - static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isBlacklisted, + static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isPossiblyOffensive, const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets, const bool hasBigrams, const bool hasMultipleChars, const int childrenPositionFieldSize) { NodeFlags nodeFlags = 0; - nodeFlags = isBlacklisted ? (nodeFlags | FLAG_IS_BLACKLISTED) : nodeFlags; + nodeFlags = isPossiblyOffensive ? (nodeFlags | FLAG_IS_POSSIBLY_OFFENSIVE) : nodeFlags; nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags; nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags; nodeFlags = hasShortcutTargets ? 
(nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags; @@ -106,9 +108,10 @@ class PatriciaTrieReadingUtils { static void readPtNodeInfo(const uint8_t *const dictBuf, const int ptNodePos, const DictionaryShortcutsStructurePolicy *const shortcutPolicy, const DictionaryBigramsStructurePolicy *const bigramPolicy, - NodeFlags *const outFlags, int *const outCodePointCount, int *const outCodePoint, - int *const outProbability, int *const outChildrenPos, int *const outShortcutPos, - int *const outBigramPos, int *const outSiblingPos); + const int *const codePointTable, NodeFlags *const outFlags, + int *const outCodePointCount, int *const outCodePoint, int *const outProbability, + int *const outChildrenPos, int *const outShortcutPos, int *const outBigramPos, + int *const outSiblingPos); private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); @@ -124,7 +127,7 @@ class PatriciaTrieReadingUtils { static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS; static const NodeFlags FLAG_HAS_BIGRAMS; static const NodeFlags FLAG_IS_NOT_A_WORD; - static const NodeFlags FLAG_IS_BLACKLISTED; + static const NodeFlags FLAG_IS_POSSIBLY_OFFENSIVE; }; } // namespace latinime #endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h index 6078d8285..6078d8285 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_array_reader.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h index b2e60a837..905deb1bc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_params.h @@ -20,10 +20,11 @@ #include <cstring> #include 
"defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" #include "utils/char_utils.h" +#include "utils/int_array_view.h" namespace latinime { @@ -88,9 +89,9 @@ class PtNodeParams { // Construct new params by updating existing PtNode params. PtNodeParams(const PtNodeParams *const ptNodeParams, const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, - const int codePointCount, const int *const codePoints, const int probability) + const CodePointArrayView codePoints, const int probability) : mHeadPos(ptNodeParams->getHeadPos()), mFlags(flags), mHasMovedFlag(true), - mParentPos(parentPos), mCodePointCount(codePointCount), mCodePoints(), + mParentPos(parentPos), mCodePointCount(codePoints.size()), mCodePoints(), mTerminalIdFieldPos(ptNodeParams->getTerminalIdFieldPos()), mTerminalId(ptNodeParams->getTerminalId()), mProbabilityFieldPos(ptNodeParams->getProbabilityFieldPos()), @@ -101,20 +102,20 @@ class PtNodeParams { mShortcutPos(ptNodeParams->getShortcutPos()), mBigramPos(ptNodeParams->getBigramsPos()), mSiblingPos(ptNodeParams->getSiblingNodePos()) { - memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount); } PtNodeParams(const PatriciaTrieReadingUtils::NodeFlags flags, const int parentPos, - const int codePointCount, const int *const codePoints, const int probability) + const CodePointArrayView codePoints, const int probability) : mHeadPos(NOT_A_DICT_POS), mFlags(flags), mHasMovedFlag(true), mParentPos(parentPos), - mCodePointCount(codePointCount), mCodePoints(), + 
mCodePointCount(codePoints.size()), mCodePoints(), mTerminalIdFieldPos(NOT_A_DICT_POS), mTerminalId(Ver4DictConstants::NOT_A_TERMINAL_ID), mProbabilityFieldPos(NOT_A_DICT_POS), mProbability(probability), mChildrenPosFieldPos(NOT_A_DICT_POS), mChildrenPos(NOT_A_DICT_POS), mBigramLinkedNodePos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_DICT_POS) { - memcpy(mCodePoints, codePoints, sizeof(int) * mCodePointCount); + memcpy(mCodePoints, codePoints.data(), sizeof(int) * mCodePointCount); } AK_FORCE_INLINE bool isValid() const { @@ -143,8 +144,8 @@ class PtNodeParams { return PatriciaTrieReadingUtils::isTerminal(mFlags); } - AK_FORCE_INLINE bool isBlacklisted() const { - return PatriciaTrieReadingUtils::isBlacklisted(mFlags); + AK_FORCE_INLINE bool isPossiblyOffensive() const { + return PatriciaTrieReadingUtils::isPossiblyOffensive(mFlags); } AK_FORCE_INLINE bool isNotAWord() const { @@ -174,11 +175,17 @@ class PtNodeParams { return mParentPos; } + AK_FORCE_INLINE const CodePointArrayView getCodePointArrayView() const { + return CodePointArrayView(mCodePoints, mCodePointCount); + } + + // TODO: Remove // Number of code points AK_FORCE_INLINE uint8_t getCodePointCount() const { return mCodePointCount; } + // TODO: Remove AK_FORCE_INLINE const int *getCodePoints() const { return mCodePoints; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h index 31299a707..15da19e0b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_reader.h @@ -19,7 +19,7 @@ #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_params.h" namespace latinime { diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h index 955d779ac..e6cad25aa 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h +++ b/native/jni/src/dictionary/structure/pt_common/pt_node_writer.h @@ -20,12 +20,12 @@ #include <unordered_map> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_params.h" #include "utils/int_array_view.h" namespace latinime { -class BigramProperty; +class NgramProperty; class UnigramProperty; // Interface class used to write PtNode information. @@ -72,7 +72,7 @@ class PtNodeWriter { const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) = 0; virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) = 0; + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry) = 0; virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) = 0; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp index 91c76941c..14428edd4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp +++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { @@ -31,21 +31,23 @@ const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; /* static */ ShortcutListReadingUtils::ShortcutFlags - ShortcutListReadingUtils::getFlagsAndForwardPointer(const uint8_t *const dictRoot, + ShortcutListReadingUtils::getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, int *const pos) { - return ByteArrayUtils::readUint8AndAdvancePosition(dictRoot, pos); + return ByteArrayUtils::readUint8AndAdvancePosition(buffer.data(), pos); } /* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer( - const uint8_t *const dictRoot, int *const pos) { + const ReadOnlyByteArrayView buffer, int *const pos) { // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself. - return ByteArrayUtils::readUint16AndAdvancePosition(dictRoot, pos) + return ByteArrayUtils::readUint16AndAdvancePosition(buffer.data(), pos) - SHORTCUT_LIST_SIZE_FIELD_SIZE; } -/* static */ int ShortcutListReadingUtils::readShortcutTarget( - const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) { - return ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos); +/* static */ int ShortcutListReadingUtils::readShortcutTarget(const ReadOnlyByteArrayView buffer, + const int maxLength, int *const outWord, int *const pos) { + // TODO: Use codePointTable for shortcuts. 
+ return ByteArrayUtils::readStringAndAdvancePosition(buffer.data(), maxLength, + nullptr /* codePointTable */, outWord, pos); } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h index d065bf7fd..71cb8cc2c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h +++ b/native/jni/src/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h @@ -20,6 +20,7 @@ #include <cstdint> #include "defines.h" +#include "utils/byte_array_view.h" namespace latinime { @@ -27,7 +28,8 @@ class ShortcutListReadingUtils { public: typedef uint8_t ShortcutFlags; - static ShortcutFlags getFlagsAndForwardPointer(const uint8_t *const dictRoot, int *const pos); + static ShortcutFlags getFlagsAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos); static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) { return flags & MASK_ATTRIBUTE_PROBABILITY; @@ -39,14 +41,15 @@ class ShortcutListReadingUtils { // This method returns the size of the shortcut list region excluding the shortcut list size // field at the beginning. 
- static int getShortcutListSizeAndForwardPointer(const uint8_t *const dictRoot, int *const pos); + static int getShortcutListSizeAndForwardPointer(const ReadOnlyByteArrayView buffer, + int *const pos); static AK_FORCE_INLINE int getShortcutListSizeFieldSize() { return SHORTCUT_LIST_SIZE_FIELD_SIZE; } - static AK_FORCE_INLINE void skipShortcuts(const uint8_t *const dictRoot, int *const pos) { - const int shortcutListSize = getShortcutListSizeAndForwardPointer(dictRoot, pos); + static AK_FORCE_INLINE void skipShortcuts(const ReadOnlyByteArrayView buffer, int *const pos) { + const int shortcutListSize = getShortcutListSizeAndForwardPointer(buffer, pos); *pos += shortcutListSize; } @@ -54,7 +57,7 @@ class ShortcutListReadingUtils { return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; } - static int readShortcutTarget(const uint8_t *const dictRoot, const int maxLength, + static int readShortcutTarget(const ReadOnlyByteArrayView buffer, const int maxLength, int *const outWord, int *const pos); private: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h index 73e291ec2..25081fa04 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h +++ b/native/jni/src/dictionary/structure/v2/bigram/bigram_list_policy.h @@ -20,24 +20,24 @@ #include <cstdint> #include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" +#include "dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" +#include "utils/byte_array_view.h" namespace latinime { class BigramListPolicy : public DictionaryBigramsStructurePolicy { public: - BigramListPolicy(const uint8_t *const bigramsBuf, const int bufSize) - : 
mBigramsBuf(bigramsBuf), mBufSize(bufSize) {} + BigramListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} ~BigramListPolicy() {} void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, int *const pos) const { BigramListReadWriteUtils::BigramFlags flags; - if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBigramsBuf, - mBufSize, &flags, outBigramPos, pos)) { - AKLOGE("Cannot read bigram entry. mBufSize: %d, pos: %d. ", mBufSize, *pos); + if (!BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBuffer, &flags, + outBigramPos, pos)) { + AKLOGE("Cannot read bigram entry. bufSize: %zd, pos: %d. ", mBuffer.size(), *pos); *outProbability = NOT_A_PROBABILITY; *outHasNext = false; return; @@ -47,14 +47,13 @@ class BigramListPolicy : public DictionaryBigramsStructurePolicy { } bool skipAllBigrams(int *const pos) const { - return BigramListReadWriteUtils::skipExistingBigrams(mBigramsBuf, mBufSize, pos); + return BigramListReadWriteUtils::skipExistingBigrams(mBuffer, pos); } private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); - const uint8_t *const mBigramsBuf; - const int mBufSize; + const ReadOnlyByteArrayView mBuffer; }; } // namespace latinime #endif // LATINIME_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp index ea32eb2a9..4e8b96b08 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.cpp @@ -14,18 +14,18 @@ * limitations under the License. 
*/ - -#include "suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h" +#include "dictionary/structure/v2/patricia_trie_policy.h" #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/session/prev_words_info.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" #include "utils/char_utils.h" namespace latinime { @@ -36,19 +36,19 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo return; } int nextPos = dicNode->getChildrenPtNodeArrayPos(); - if (nextPos < 0 || nextPos >= mDictBufferSize) { - AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", - nextPos, mDictBufferSize); + if (!isValidPos(nextPos)) { + AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %zd", + nextPos, mBuffer.size()); mIsCorrupted = true; ASSERT(false); return; } const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mDictRoot, &nextPos); + mBuffer.data(), &nextPos); for (int i = 0; i < childCount; i++) { - if (nextPos < 0 || nextPos >= mDictBufferSize) { - AKLOGE("Child PtNode position is invalid. 
pos: %d, dict size: %d, childCount: %d / %d", - nextPos, mDictBufferSize, i, childCount); + if (!isValidPos(nextPos)) { + AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %zd, childCount: %d / %d", + nextPos, mBuffer.size(), i, childCount); mIsCorrupted = true; ASSERT(false); return; @@ -57,7 +57,12 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo } } -// This retrieves code points and the probability of the word by its terminal position. +int PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + return getCodePointsAndProbabilityAndReturnCodePointCount(wordId, maxCodePointCount, + outCodePoints, nullptr /* outUnigramProbability */); +} +// This retrieves code points and the probability of the word by its id. // Due to the fact that words are ordered in the dictionary in a strict breadth-first order, // it is possible to check for this with advantageous complexity. For each PtNode array, we search // for PtNodes with children and compare the children position with the position we look for. @@ -68,18 +73,22 @@ void PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNo // with a z, it's the last PtNode of the root array, so all children addresses will be smaller // than the position we look for, and we have to descend the z PtNode). /* Parameters : - * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is - * what is stored as the "bigram position" in each bigram) + * wordId: Id of the word we are searching for. * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. * outUnigramProbability: a pointer to an int to write the probability into. * Return value : the code point count, of 0 if the word was not found. 
*/ // TODO: Split this function to be more readable int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + const int wordId, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); int pos = getRootPosition(); int wordPos = 0; + const int *const codePointTable = mHeaderPolicy.getCodePointTable(); + if (outUnigramProbability) { + *outUnigramProbability = NOT_A_PROBABILITY; + } // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will // only traverse PtNodes that are actually a part of the terminal we are searching, so each // time we enter this loop we are one depth level further than last time. @@ -90,56 +99,57 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( int lastCandidatePtNodePos = 0; // Let's loop through PtNodes in this PtNode array searching for either the terminal // or one of its ascendants. - if (pos < 0 || pos >= mDictBufferSize) { - AKLOGE("PtNode array position is invalid. pos: %d, dict size: %d", - pos, mDictBufferSize); + if (!isValidPos(pos)) { + AKLOGE("PtNode array position is invalid. pos: %d, dict size: %zd", + pos, mBuffer.size()); mIsCorrupted = true; ASSERT(false); - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { + mBuffer.data(), &pos); ptNodeCount > 0; --ptNodeCount) { const int startPos = pos; - if (pos < 0 || pos >= mDictBufferSize) { - AKLOGE("PtNode position is invalid. pos: %d, dict size: %d", pos, mDictBufferSize); + if (!isValidPos(pos)) { + AKLOGE("PtNode position is invalid. 
pos: %d, dict size: %zd", pos, mBuffer.size()); mIsCorrupted = true; ASSERT(false); - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mBuffer.data(), &pos); const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &pos); + mBuffer.data(), codePointTable, &pos); if (ptNodePos == startPos) { // We found the position. Copy the rest of the code points in the buffer and return // the length. outCodePoints[wordPos] = character; if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &pos); + mBuffer.data(), codePointTable, &pos); // We count code points in order to avoid infinite loops if the file is broken // or if there is some other bug int charCount = maxCodePointCount; while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { outCodePoints[++wordPos] = nextChar; nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &pos); + mBuffer.data(), codePointTable, &pos); } } - *outUnigramProbability = - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, - &pos); + if (outUnigramProbability) { + *outUnigramProbability = + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( + mBuffer.data(), &pos); + } return ++wordPos; } // We need to skip past this PtNode, so skip any remaining code points after the // first and possibly the probability. 
if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { - PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + PatriciaTrieReadingUtils::skipCharacters(mBuffer.data(), flags, MAX_WORD_LENGTH, + codePointTable, &pos); } if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &pos); } // The fact that this PtNode has children is very important. Since we already know // that this PtNode does not match, if it has no children we know it is irrelevant @@ -154,7 +164,8 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( int currentPos = pos; // Here comes the tricky part. First, read the children position. const int childrenPos = PatriciaTrieReadingUtils - ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, ¤tPos); + ::readChildrenPositionAndAdvancePosition(mBuffer.data(), flags, + ¤tPos); if (childrenPos > ptNodePos) { // If the children pos is greater than the position, it means the previous // PtNode, which position is stored in lastCandidatePtNodePos, was the right @@ -184,30 +195,30 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( if (0 != lastCandidatePtNodePos) { const PatriciaTrieReadingUtils::NodeFlags lastFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( - mDictRoot, &lastCandidatePtNodePos); + mBuffer.data(), &lastCandidatePtNodePos); const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &lastCandidatePtNodePos); + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); // We copy all the characters in this PtNode to the buffer outCodePoints[wordPos] = lastChar; if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &lastCandidatePtNodePos); + mBuffer.data(), codePointTable, 
&lastCandidatePtNodePos); int charCount = maxCodePointCount; while (-1 != nextChar && --charCount > 0) { outCodePoints[++wordPos] = nextChar; nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( - mDictRoot, &lastCandidatePtNodePos); + mBuffer.data(), codePointTable, &lastCandidatePtNodePos); } } ++wordPos; // Now we only need to branch to the children address. Skip the probability if // it's there, read pos, and break to resume the search at pos. if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mBuffer.data(), &lastCandidatePtNodePos); } pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mDictRoot, lastFlags, &lastCandidatePtNodePos); + mBuffer.data(), lastFlags, &lastCandidatePtNodePos); break; } else { // Here is a little tricky part: we come here if we found out that all children @@ -219,18 +230,17 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( // ready to start the next one. if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mDictRoot, flags, &pos); + mBuffer.data(), flags, &pos); } if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { mShortcutListPolicy.skipAllShortcuts(&pos); } if (PatriciaTrieReadingUtils::hasBigrams(flags)) { if (!mBigramListPolicy.skipAllBigrams(&pos)) { - AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", mDictBufferSize, + AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos); mIsCorrupted = true; ASSERT(false); - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } } @@ -243,17 +253,16 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( // our pos is after the end of this PtNode, at the start of the next one. 
if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - mDictRoot, flags, &pos); + mBuffer.data(), flags, &pos); } if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { mShortcutListPolicy.skipAllShortcuts(&pos); } if (PatriciaTrieReadingUtils::hasBigrams(flags)) { if (!mBigramListPolicy.skipAllBigrams(&pos)) { - AKLOGE("Cannot skip bigrams. BufSize: %d, pos: %d.", mDictBufferSize, pos); + AKLOGE("Cannot skip bigrams. BufSize: %zd, pos: %d.", mBuffer.size(), pos); mIsCorrupted = true; ASSERT(false); - *outUnigramProbability = NOT_A_PROBABILITY; return 0; } } @@ -267,18 +276,48 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( } // This function gets the position of the terminal PtNode of the exact matching word in the -// dictionary. If no match is found, it returns NOT_A_DICT_POS. -int PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const { +// dictionary. If no match is found, it returns NOT_A_WORD_ID. 
+int PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { DynamicPtReadingHelper readingHelper(&mPtNodeReader, &mPtNodeArrayReader); readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const int ptNodePos = - readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); if (readingHelper.isError()) { mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + AKLOGE("Dictionary reading error in getWordId()."); + } + return getWordIdFromTerminalPtNodePos(ptNodePos); +} + +const WordAttributes PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); + const PtNodeParams ptNodeParams = + mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (multiBigramMap) { + const int probability = multiBigramMap->getBigramProbability(this /* structurePolicy */, + prevWordIds, wordId, ptNodeParams.getProbability()); + return getWordAttributes(probability, ptNodeParams); + } + if (!prevWordIds.empty()) { + const int bigramProbability = getProbabilityOfWord(prevWordIds, wordId); + if (bigramProbability != NOT_A_PROBABILITY) { + return getWordAttributes(bigramProbability, ptNodeParams); + } } - return ptNodePos; + return getWordAttributes(getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY), + ptNodeParams); +} + +const WordAttributes PatriciaTriePolicy::getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const { + return WordAttributes(probability, false /* isBlacklisted */, ptNodeParams.isNotAWord(), + ptNodeParams.isPossiblyOffensive()); } int 
PatriciaTriePolicy::getProbability(const int unigramProbability, @@ -297,21 +336,22 @@ int PatriciaTriePolicy::getProbability(const int unigramProbability, } } -int PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos, - const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { +int PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID) { return NOT_A_PROBABILITY; } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams = mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.isNotAWord() || ptNodeParams.isBlacklisted()) { - // If this is not a word, or if it's a blacklisted entry, it should behave as - // having no probability outside of the suggestion process (where it should be used - // for shortcuts). + if (ptNodeParams.isNotAWord()) { + // If this is not a word, it should behave as having no probability outside of the + // suggestion process (where it should be used for shortcuts). 
return NOT_A_PROBABILITY; } - if (prevWordsPtNodePos) { - const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); + if (!prevWordIds.empty()) { + const int bigramsPosition = getBigramsPositionOfPtNode( + getTerminalPtNodePosFromWordId(prevWordIds[0])); BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); while (bigramsIt.hasNext()) { bigramsIt.next(); @@ -325,19 +365,26 @@ int PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodeP return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); } -void PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos, +void PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, NgramListener *const listener) const { - if (!prevWordsPtNodePos) { + if (prevWordIds.empty()) { return; } - const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); + const int bigramsPosition = getBigramsPositionOfPtNode( + getTerminalPtNodePosFromWordId(prevWordIds[0])); BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramsPosition); while (bigramsIt.hasNext()) { bigramsIt.next(); - listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos()); + listener->onVisitEntry(bigramsIt.getProbability(), + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos())); } } +BinaryDictionaryShortcutIterator PatriciaTriePolicy::getShortcutIterator(const int wordId) const { + const int shortcutPos = getShortcutPositionOfPtNode(getTerminalPtNodePosFromWordId(wordId)); + return BinaryDictionaryShortcutIterator(&mShortcutListPolicy, shortcutPos); +} + int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; @@ -362,35 +409,32 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod int shortcutPos = NOT_A_DICT_POS; int bigramPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS; - 
PatriciaTrieReadingUtils::readPtNodeInfo(mDictRoot, ptNodePos, getShortcutsStructurePolicy(), - &mBigramListPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, - &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + const int *const codePointTable = mHeaderPolicy.getCodePointTable(); + PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, &mShortcutListPolicy, + &mBigramListPolicy, codePointTable, &flags, &mergedNodeCodePointCount, + mergedNodeCodePoints, &probability, &childrenPos, &shortcutPos, &bigramPos, + &siblingPos); // Skip PtNodes don't start with Unicode code point because they represent non-word information. if (CharUtils::isInUnicodeSpace(mergedNodeCodePoints[0])) { - childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, - PatriciaTrieReadingUtils::isTerminal(flags), - PatriciaTrieReadingUtils::hasChildrenInFlags(flags), - PatriciaTrieReadingUtils::isBlacklisted(flags) - || PatriciaTrieReadingUtils::isNotAWord(flags), - mergedNodeCodePointCount, mergedNodeCodePoints); + const int wordId = PatriciaTrieReadingUtils::isTerminal(flags) ? 
ptNodePos : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, childrenPos, wordId, + CodePointArrayView(mergedNodeCodePoints, mergedNodeCodePointCount)); } return siblingPos; } -const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoints, - const int codePointCount) const { - const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, - false /* forceLowerCaseSearch */); - if (ptNodePos == NOT_A_DICT_POS) { +const WordProperty PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { AKLOGE("getWordProperty was called for invalid word."); return WordProperty(); } + const int ptNodePos = getTerminalPtNodePosFromWordId(wordId); const PtNodeParams ptNodeParams = mPtNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - std::vector<int> codePointVector(ptNodeParams.getCodePoints(), - ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); // Fetch bigram information. 
- std::vector<BigramProperty> bigrams; + std::vector<NgramProperty> ngrams; const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); int bigramWord1CodePoints[MAX_WORD_LENGTH]; BinaryDictionaryBigramsIterator bigramsIt(&mBigramListPolicy, bigramListPos); @@ -401,13 +445,14 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin if (bigramsIt.getBigramPos() != NOT_A_DICT_POS) { int word1Probability = NOT_A_PROBABILITY; const int word1CodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( - bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramWord1CodePoints, - &word1Probability); - const std::vector<int> word1(bigramWord1CodePoints, - bigramWord1CodePoints + word1CodePointCount); + getWordIdFromTerminalPtNodePos(bigramsIt.getBigramPos()), MAX_WORD_LENGTH, + bigramWord1CodePoints, &word1Probability); const int probability = getProbability(word1Probability, bigramsIt.getProbability()); - bigrams.emplace_back(&word1, probability, - NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */); + ngrams.emplace_back( + NgramContext(wordCodePoints.data(), wordCodePoints.size(), + ptNodeParams.representsBeginningOfSentence()), + CodePointArrayView(bigramWord1CodePoints, word1CodePointCount).toVector(), + probability, HistoricalInfo()); } } // Fetch shortcut information. 
@@ -415,25 +460,25 @@ const WordProperty PatriciaTriePolicy::getWordProperty(const int *const codePoin int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); if (shortcutPos != NOT_A_DICT_POS) { int shortcutTargetCodePoints[MAX_WORD_LENGTH]; - ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mDictRoot, &shortcutPos); + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &shortcutPos); bool hasNext = true; while (hasNext) { const ShortcutListReadingUtils::ShortcutFlags shortcutFlags = - ShortcutListReadingUtils::getFlagsAndForwardPointer(mDictRoot, &shortcutPos); + ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, &shortcutPos); hasNext = ShortcutListReadingUtils::hasNext(shortcutFlags); const int shortcutTargetLength = ShortcutListReadingUtils::readShortcutTarget( - mDictRoot, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); - const std::vector<int> shortcutTarget(shortcutTargetCodePoints, - shortcutTargetCodePoints + shortcutTargetLength); + mBuffer, MAX_WORD_LENGTH, shortcutTargetCodePoints, &shortcutPos); const int shortcutProbability = ShortcutListReadingUtils::getProbabilityFromFlags(shortcutFlags); - shortcuts.emplace_back(&shortcutTarget, shortcutProbability); + shortcuts.emplace_back( + CodePointArrayView(shortcutTargetCodePoints, shortcutTargetLength).toVector(), + shortcutProbability); } } const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), - ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), - NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); - return WordProperty(&codePointVector, &unigramProperty, &bigrams); + ptNodeParams.isNotAWord(), ptNodeParams.isPossiblyOffensive(), + ptNodeParams.getProbability(), HistoricalInfo(), std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); } int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const 
outCodePoints, @@ -455,9 +500,8 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC return 0; } const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; - int unigramProbability = NOT_A_PROBABILITY; - *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount(terminalPtNodePos, - MAX_WORD_LENGTH, outCodePoints, &unigramProbability); + *outCodePointCount = getCodePointsAndReturnCodePointCount( + getWordIdFromTerminalPtNodePos(terminalPtNodePos), MAX_WORD_LENGTH, outCodePoints); const int nextToken = token + 1; if (nextToken >= terminalPtNodePositionsVectorSize) { // All words have been iterated. @@ -467,4 +511,16 @@ int PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outC return nextToken; } +int PatriciaTriePolicy::getWordIdFromTerminalPtNodePos(const int ptNodePos) const { + return ptNodePos == NOT_A_DICT_POS ? NOT_A_WORD_ID : ptNodePos; +} + +int PatriciaTriePolicy::getTerminalPtNodePosFromWordId(const int wordId) const { + return wordId == NOT_A_WORD_ID ? 
NOT_A_DICT_POS : wordId; +} + +bool PatriciaTriePolicy::isValidPos(const int pos) const { + return pos >= 0 && pos < static_cast<int>(mBuffer.size()); +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h index 70351d147..8edfa7d10 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/dictionary/structure/v2/patricia_trie_policy.h @@ -21,35 +21,36 @@ #include <vector> #include "defines.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/v2/bigram/bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/v2/bigram/bigram_list_policy.h" +#include "dictionary/structure/v2/shortcut/shortcut_list_policy.h" +#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" +#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" +#include "dictionary/utils/format_utils.h" +#include "dictionary/utils/mmapped_buffer.h" #include "utils/byte_array_view.h" +#include "utils/int_array_view.h" namespace latinime { class DicNode; class DicNodeVector; +// Word id = Position of a PtNode that represents the word. +// Max supported n-gram is bigram. 
class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: PatriciaTriePolicy(MmappedBuffer::MmappedBufferPtr mmappedBuffer) : mMmappedBuffer(std::move(mmappedBuffer)), mHeaderPolicy(mMmappedBuffer->getReadOnlyByteArrayView().data(), - FormatUtils::VERSION_2), - mDictRoot(mMmappedBuffer->getReadOnlyByteArrayView().data() - + mHeaderPolicy.getSize()), - mDictBufferSize(mMmappedBuffer->getReadOnlyByteArrayView().size() - - mHeaderPolicy.getSize()), - mBigramListPolicy(mDictRoot, mDictBufferSize), mShortcutListPolicy(mDictRoot), - mPtNodeReader(mDictRoot, mDictBufferSize, &mBigramListPolicy, &mShortcutListPolicy), - mPtNodeArrayReader(mDictRoot, mDictBufferSize), - mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {} + FormatUtils::detectFormatVersion(mMmappedBuffer->getReadOnlyByteArrayView())), + mBuffer(mMmappedBuffer->getReadOnlyByteArrayView().skip(mHeaderPolicy.getSize())), + mBigramListPolicy(mBuffer), mShortcutListPolicy(mBuffer), + mPtNodeReader(mBuffer, &mBigramListPolicy, &mShortcutListPolicy, + mHeaderPolicy.getCodePointTable()), + mPtNodeArrayReader(mBuffer), mTerminalPtNodePositionsForIteratingWords(), + mIsCorrupted(false) {} AK_FORCE_INLINE int getRootPosition() const { return 0; @@ -58,57 +59,62 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { void createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const; - int getCodePointsAndProbabilityAndReturnCodePointCount( - const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const; + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; + + int getWordId(const CodePointArrayView wordCodePoints, const bool forceLowerCaseSearch) const; - int getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const; + const 
WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; int getProbability(const int unigramProbability, const int bigramProbability) const; - int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, const int ptNodePos) const; + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; - void iterateNgramEntries(const int *const prevWordsPtNodePos, + void iterateNgramEntries(const WordIdArrayView prevWordIds, NgramListener *const listener) const; - int getShortcutPositionOfPtNode(const int ptNodePos) const; + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { return &mHeaderPolicy; } - const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { - return &mShortcutListPolicy; - } - - bool addUnigramEntry(const int *const word, const int length, + bool addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); return false; } - bool removeUnigramEntry(const int *const word, const int length) { + bool removeUnigramEntry(const CodePointArrayView wordCodePoints) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); return false; } - bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty) { + bool addNgramEntry(const NgramProperty *const ngramProperty) { // This method should not be called for non-updatable dictionary. 
AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); return false; } - bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, - const int length) { + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); return false; } + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const HistoricalInfo historicalInfo) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + bool flush(const char *const filePath) { // This method should not be called for non-updatable dictionary. AKLOGI("Warning: flush() is called for non-updatable dictionary."); @@ -135,8 +141,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { } } - const WordProperty getWordProperty(const int *const codePoints, - const int codePointCount) const; + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; int getNextWordAndNextToken(const int token, int *const outCodePoints, int *const outCodePointCount); @@ -150,8 +155,7 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const MmappedBuffer::MmappedBufferPtr mMmappedBuffer; const HeaderPolicy mHeaderPolicy; - const uint8_t *const mDictRoot; - const int mDictBufferSize; + const ReadOnlyByteArrayView mBuffer; const BigramListPolicy mBigramListPolicy; const ShortcutListPolicy mShortcutListPolicy; const Ver2ParticiaTrieNodeReader mPtNodeReader; @@ -159,9 +163,18 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { std::vector<int> mTerminalPtNodePositionsForIteratingWords; mutable 
bool mIsCorrupted; + int getCodePointsAndProbabilityAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; int getBigramsPositionOfPtNode(const int ptNodePos) const; int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, DicNodeVector *const childDicNodes) const; + int getWordIdFromTerminalPtNodePos(const int ptNodePos) const; + int getTerminalPtNodePosFromWordId(const int wordId) const; + const WordAttributes getWordAttributes(const int probability, + const PtNodeParams &ptNodeParams) const; + bool isValidPos(const int pos) const; }; } // namespace latinime #endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h index 8e16ccc05..995b1ed01 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/shortcut/shortcut_list_policy.h +++ b/native/jni/src/dictionary/structure/v2/shortcut/shortcut_list_policy.h @@ -20,15 +20,15 @@ #include <cstdint> #include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "utils/byte_array_view.h" namespace latinime { class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { public: - explicit ShortcutListPolicy(const uint8_t *const shortcutBuf) - : mShortcutsBuf(shortcutBuf) {} + explicit ShortcutListPolicy(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {} ~ShortcutListPolicy() {} @@ -37,7 +37,7 @@ class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { return 
NOT_A_DICT_POS; } int listPos = pos; - ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mShortcutsBuf, &listPos); + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mBuffer, &listPos); return listPos; } @@ -45,7 +45,7 @@ class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, int *const pos) const { const ShortcutListReadingUtils::ShortcutFlags flags = - ShortcutListReadingUtils::getFlagsAndForwardPointer(mShortcutsBuf, pos); + ShortcutListReadingUtils::getFlagsAndForwardPointer(mBuffer, pos); if (outHasNext) { *outHasNext = ShortcutListReadingUtils::hasNext(flags); } @@ -54,20 +54,20 @@ class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { } if (outCodePoint) { *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( - mShortcutsBuf, maxCodePointCount, outCodePoint, pos); + mBuffer, maxCodePointCount, outCodePoint, pos); } } void skipAllShortcuts(int *const pos) const { const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(mShortcutsBuf, pos); + ::getShortcutListSizeAndForwardPointer(mBuffer, pos); *pos += shortcutListSize; } private: DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy); - const uint8_t *const mShortcutsBuf; + const ReadOnlyByteArrayView mBuffer; }; } // namespace latinime #endif // LATINIME_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp index c1e938710..cbb8ead81 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp +++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.cpp @@ -14,18 +14,18 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h" +#include "dictionary/structure/v2/ver2_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" namespace latinime { const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNodePos( const int ptNodePos) const { - if (ptNodePos < 0 || ptNodePos >= mDictSize) { + if (ptNodePos < 0 || ptNodePos >= static_cast<int>(mBuffer.size())) { // Reading invalid position because of bug or broken dictionary. - AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %d", - ptNodePos, mDictSize); + AKLOGE("Fetching PtNode info from invalid dictionary position: %d, dictionary size: %zd", + ptNodePos, mBuffer.size()); ASSERT(false); return PtNodeParams(); } @@ -37,9 +37,9 @@ const PtNodeParams Ver2ParticiaTrieNodeReader::fetchPtNodeParamsInBufferFromPtNo int shortcutPos = NOT_A_DICT_POS; int bigramPos = NOT_A_DICT_POS; int siblingPos = NOT_A_DICT_POS; - PatriciaTrieReadingUtils::readPtNodeInfo(mDictBuffer, ptNodePos, mShortuctPolicy, - mBigramPolicy, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, &probability, - &childrenPos, &shortcutPos, &bigramPos, &siblingPos); + PatriciaTrieReadingUtils::readPtNodeInfo(mBuffer.data(), ptNodePos, mShortcutPolicy, + mBigramPolicy, mCodePointTable, &flags, &mergedNodeCodePointCount, mergedNodeCodePoints, + &probability, &childrenPos, &shortcutPos, &bigramPos, &siblingPos); if (mergedNodeCodePointCount <= 0) { AKLOGE("Empty PtNode is not allowed. 
Code point count: %d", mergedNodeCodePointCount); ASSERT(false); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h index f0725b66d..dc87c7c68 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_patricia_trie_node_reader.h +++ b/native/jni/src/dictionary/structure/v2/ver2_patricia_trie_node_reader.h @@ -20,8 +20,9 @@ #include <cstdint> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" +#include "utils/byte_array_view.h" namespace latinime { @@ -30,21 +31,22 @@ class DictionaryShortcutsStructurePolicy; class Ver2ParticiaTrieNodeReader : public PtNodeReader { public: - Ver2ParticiaTrieNodeReader(const uint8_t *const dictBuffer, const int dictSize, + Ver2ParticiaTrieNodeReader(const ReadOnlyByteArrayView buffer, const DictionaryBigramsStructurePolicy *const bigramPolicy, - const DictionaryShortcutsStructurePolicy *const shortcutPolicy) - : mDictBuffer(dictBuffer), mDictSize(dictSize), mBigramPolicy(bigramPolicy), - mShortuctPolicy(shortcutPolicy) {} + const DictionaryShortcutsStructurePolicy *const shortcutPolicy, + const int *const codePointTable) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy), + mCodePointTable(codePointTable) {} virtual const PtNodeParams fetchPtNodeParamsInBufferFromPtNodePos(const int ptNodePos) const; private: DISALLOW_IMPLICIT_CONSTRUCTORS(Ver2ParticiaTrieNodeReader); - const uint8_t *const mDictBuffer; - const int mDictSize; + const ReadOnlyByteArrayView mBuffer; const DictionaryBigramsStructurePolicy *const mBigramPolicy; - const DictionaryShortcutsStructurePolicy *const mShortuctPolicy; + const 
DictionaryShortcutsStructurePolicy *const mShortcutPolicy; + const int *const mCodePointTable; }; } // namespace latinime #endif /* LATINIME_VER2_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp index b46617d96..8b9b02df1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.cpp +++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.cpp @@ -14,24 +14,24 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h" +#include "dictionary/structure/v2/ver2_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" namespace latinime { bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, int *const outPtNodeCount, int *const outFirstPtNodePos) const { - if (ptNodeArrayPos < 0 || ptNodeArrayPos >= mDictSize) { + if (ptNodeArrayPos < 0 || ptNodeArrayPos >= static_cast<int>(mBuffer.size())) { // Reading invalid position because of a bug or a broken dictionary. 
- AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %d", - ptNodeArrayPos, mDictSize); + AKLOGE("Reading PtNode array info from invalid dictionary position: %d, dict size: %zd", + ptNodeArrayPos, mBuffer.size()); ASSERT(false); return false; } int readingPos = ptNodeArrayPos; const int ptNodeCountInArray = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mDictBuffer, &readingPos); + mBuffer.data(), &readingPos); *outPtNodeCount = ptNodeCountInArray; *outFirstPtNodePos = readingPos; return true; @@ -39,10 +39,10 @@ bool Ver2PtNodeArrayReader::readPtNodeArrayInfoAndReturnIfValid(const int ptNode bool Ver2PtNodeArrayReader::readForwardLinkAndReturnIfValid(const int forwordLinkPos, int *const outNextPtNodeArrayPos) const { - if (forwordLinkPos < 0 || forwordLinkPos >= mDictSize) { + if (forwordLinkPos < 0 || forwordLinkPos >= static_cast<int>(mBuffer.size())) { // Reading invalid position because of bug or broken dictionary. - AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %d", - forwordLinkPos, mDictSize); + AKLOGE("Reading forward link from invalid dictionary position: %d, dict size: %zd", + forwordLinkPos, mBuffer.size()); ASSERT(false); return false; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h index 548272148..32fa96d15 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/ver2_pt_node_array_reader.h +++ b/native/jni/src/dictionary/structure/v2/ver2_pt_node_array_reader.h @@ -20,14 +20,14 @@ #include <cstdint> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" +#include "utils/byte_array_view.h" namespace latinime { class Ver2PtNodeArrayReader : public PtNodeArrayReader { public: - Ver2PtNodeArrayReader(const uint8_t *const 
dictBuffer, const int dictSize) - : mDictBuffer(dictBuffer), mDictSize(dictSize) {}; + Ver2PtNodeArrayReader(const ReadOnlyByteArrayView buffer) : mBuffer(buffer) {}; virtual bool readPtNodeArrayInfoAndReturnIfValid(const int ptNodeArrayPos, int *const outPtNodeCount, int *const outFirstPtNodePos) const; @@ -37,8 +37,7 @@ class Ver2PtNodeArrayReader : public PtNodeArrayReader { private: DISALLOW_COPY_AND_ASSIGN(Ver2PtNodeArrayReader); - const uint8_t *const mDictBuffer; - const int mDictSize; + const ReadOnlyByteArrayView mBuffer; }; } // namespace latinime #endif /* LATINIME_VER2_PT_NODE_ARRAY_READER_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp new file mode 100644 index 000000000..165947f87 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" + +namespace latinime { + +// Used to provide stable probabilities even if the user's input count is small. +const int DynamicLanguageModelProbabilityUtils::ASSUMED_MIN_COUNTS[] = {8192, 2, 2, 1}; + +// Encoded backoff weights. 
+// Note that we give positive values for trigrams and quadgrams that means the weight is more than +// 1. +// TODO: Apply backoff for main dictionaries and quit giving a positive backoff weight. +const int DynamicLanguageModelProbabilityUtils::ENCODED_BACKOFF_WEIGHTS[] = {-32, -4, 2, 8}; + +// This value is used to remove too old entries from the dictionary. +const int DynamicLanguageModelProbabilityUtils::DURATION_TO_DISCARD_ENTRY_IN_SECONDS = + 300 * 24 * 60 * 60; // 300 days + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h new file mode 100644 index 000000000..71824c954 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/dynamic_language_model_probability_utils.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H +#define LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H + +#include <algorithm> + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "utils/ngram_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +class DynamicLanguageModelProbabilityUtils { + public: + static float computeRawProbabilityFromCounts(const int count, const int contextCount, + const NgramType ngramType) { + const int minCount = ASSUMED_MIN_COUNTS[static_cast<int>(ngramType)]; + return static_cast<float>(count) / static_cast<float>(std::max(contextCount, minCount)); + } + + static float backoff(const int ngramProbability, const NgramType ngramType) { + const int probability = + ngramProbability + ENCODED_BACKOFF_WEIGHTS[static_cast<int>(ngramType)]; + return std::min(std::max(probability, NOT_A_PROBABILITY), MAX_PROBABILITY); + } + + static int getDecayedProbability(const int probability, const HistoricalInfo historicalInfo) { + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + if (elapsedTime < 0) { + AKLOGE("The elapsed time is negatime value. Timestamp overflow?"); + return NOT_A_PROBABILITY; + } + // TODO: Improve this logic. + // We don't modify probability depending on the elapsed time. + return probability; + } + + static int shouldRemoveEntryDuringGC(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + const int elapsedTime = TimeKeeper::peekCurrentTime() - historicalInfo.getTimestamp(); + return elapsedTime > DURATION_TO_DISCARD_ENTRY_IN_SECONDS; + } + + static int getPriorityToPreventFromEviction(const HistoricalInfo historicalInfo) { + // TODO: Improve this logic. + // More recently input entries get higher priority. 
+ return historicalInfo.getTimestamp(); + } + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicLanguageModelProbabilityUtils); + + static_assert(MAX_PREV_WORD_COUNT_FOR_N_GRAM <= 3, "Max supported Ngram is Quadgram."); + + static const int ASSUMED_MIN_COUNTS[]; + static const int ENCODED_BACKOFF_WEIGHTS[]; + static const int DURATION_TO_DISCARD_ENTRY_IN_SECONDS; +}; + +} // namespace latinime +#endif /* LATINIME_DYNAMIC_LANGUAGE_MODEL_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp new file mode 100644 index 000000000..c10e4906b --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.cpp @@ -0,0 +1,478 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/content/language_model_dict_content.h" + +#include <algorithm> +#include <cstring> + +#include "dictionary/structure/v4/content/dynamic_language_model_probability_utils.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +const int LanguageModelDictContent::TRIE_MAP_BUFFER_INDEX = 0; +const int LanguageModelDictContent::GLOBAL_COUNTERS_BUFFER_INDEX = 1; + +bool LanguageModelDictContent::save(FILE *const file) const { + return mTrieMap.save(file) && mGlobalCounters.save(file); +} + +bool LanguageModelDictContent::runGC( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const LanguageModelDictContent *const originalContent) { + return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), + 0 /* nextLevelBitmapEntryIndex */); +} + +const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArrayView prevWordIds, + const int wordId, const bool mustMatchAllPrevWords, + const HeaderPolicy *const headerPolicy) const { + int bitmapEntryIndices[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; + bitmapEntryIndices[0] = mTrieMap.getRootBitmapEntryIndex(); + int maxPrevWordCount = 0; + for (size_t i = 0; i < prevWordIds.size(); ++i) { + const int nextBitmapEntryIndex = + mTrieMap.get(prevWordIds[i], bitmapEntryIndices[i]).mNextLevelBitmapEntryIndex; + if (nextBitmapEntryIndex == TrieMap::INVALID_INDEX) { + break; + } + maxPrevWordCount = i + 1; + bitmapEntryIndices[i + 1] = nextBitmapEntryIndex; + } + + const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId); + if (mHasHistoricalInfo && unigramProbabilityEntry.getHistoricalInfo()->getCount() == 0) { + // The word should be treated as an invalid word.
+ return WordAttributes(); + } + for (int i = maxPrevWordCount; i >= 0; --i) { + if (mustMatchAllPrevWords && prevWordIds.size() > static_cast<size_t>(i)) { + break; + } + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndices[i]); + if (!result.mIsValid) { + continue; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); + int probability = NOT_A_PROBABILITY; + if (mHasHistoricalInfo) { + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + int contextCount = 0; + if (i == 0) { + // unigram + contextCount = mGlobalCounters.getTotalCount(); + } else { + const ProbabilityEntry prevWordProbabilityEntry = getNgramProbabilityEntry( + prevWordIds.skip(1 /* n */).limit(i - 1), prevWordIds[0]); + if (!prevWordProbabilityEntry.isValid()) { + continue; + } + if (prevWordProbabilityEntry.representsBeginningOfSentence() + && historicalInfo->getCount() == 1) { + // BoS ngram requires multiple contextCount. + continue; + } + contextCount = prevWordProbabilityEntry.getHistoricalInfo()->getCount(); + } + const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(i + 1); + const float rawProbability = + DynamicLanguageModelProbabilityUtils::computeRawProbabilityFromCounts( + historicalInfo->getCount(), contextCount, ngramType); + const int encodedRawProbability = + ProbabilityUtils::encodeRawProbability(rawProbability); + const int decayedProbability = + DynamicLanguageModelProbabilityUtils::getDecayedProbability( + encodedRawProbability, *historicalInfo); + probability = DynamicLanguageModelProbabilityUtils::backoff( + decayedProbability, ngramType); + } else { + probability = probabilityEntry.getProbability(); + } + // TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in + // probabilityEntry. 
+ return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(), + unigramProbabilityEntry.isNotAWord(), + unigramProbabilityEntry.isPossiblyOffensive()); + } + // Cannot find the word. + return WordAttributes(); +} + +ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( + const WordIdArrayView prevWordIds, const int wordId) const { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + return ProbabilityEntry(); + } + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); + if (!result.mIsValid) { + // Not found. + return ProbabilityEntry(); + } + return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); +} + +bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId, const ProbabilityEntry *const probabilityEntry) { + if (wordId == Ver4DictConstants::NOT_A_TERMINAL_ID) { + return false; + } + const int bitmapEntryIndex = createAndGetBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + return false; + } + return mTrieMap.put(wordId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); +} + +bool LanguageModelDictContent::removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { + // Cannot find bitmap entry for the probability entry. The entry doesn't exist. 
+ return false; + } + return mTrieMap.remove(wordId, bitmapEntryIndex); +} + +LanguageModelDictContent::EntryRange LanguageModelDictContent::getProbabilityEntries( + const WordIdArrayView prevWordIds) const { + const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); + return EntryRange(mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex), mHasHistoricalInfo); +} + +std::vector<LanguageModelDictContent::DumppedFullEntryInfo> + LanguageModelDictContent::exportAllNgramEntriesRelatedToWord( + const HeaderPolicy *const headerPolicy, const int wordId) const { + const TrieMap::Result result = mTrieMap.getRoot(wordId); + if (!result.mIsValid || result.mNextLevelBitmapEntryIndex == TrieMap::INVALID_INDEX) { + // The word doesn't have any related ngram entries. + return std::vector<DumppedFullEntryInfo>(); + } + std::vector<int> prevWordIds = { wordId }; + std::vector<DumppedFullEntryInfo> entries; + exportAllNgramEntriesRelatedToWordInner(headerPolicy, result.mNextLevelBitmapEntryIndex, + &prevWordIds, &entries); + return entries; +} + +void LanguageModelDictContent::exportAllNgramEntriesRelatedToWordInner( + const HeaderPolicy *const headerPolicy, const int bitmapEntryIndex, + std::vector<int> *const prevWordIds, + std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const { + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + const int wordId = entry.key(); + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + if (probabilityEntry.isValid()) { + const WordAttributes wordAttributes = getWordAttributes( + WordIdArrayView(*prevWordIds), wordId, true /* mustMatchAllPrevWords */, + headerPolicy); + outBummpedFullEntryInfo->emplace_back(*prevWordIds, wordId, + wordAttributes, probabilityEntry); + } + if (entry.hasNextLevelMap()) { + prevWordIds->push_back(wordId); + exportAllNgramEntriesRelatedToWordInner(headerPolicy, + entry.getNextLevelBitmapEntryIndex(), prevWordIds, 
outBummpedFullEntryInfo); + prevWordIds->pop_back(); + } + } +} + +bool LanguageModelDictContent::truncateEntries(const EntryCounts &currentEntryCounts, + const EntryCounts &maxEntryCounts, const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const outEntryCounters) { + for (int prevWordCount = 0; prevWordCount <= MAX_PREV_WORD_COUNT_FOR_N_GRAM; ++prevWordCount) { + const int totalWordCount = prevWordCount + 1; + const NgramType ngramType = NgramUtils::getNgramTypeFromWordCount(totalWordCount); + if (currentEntryCounts.getNgramCount(ngramType) + <= maxEntryCounts.getNgramCount(ngramType)) { + outEntryCounters->setNgramCount(ngramType, + currentEntryCounts.getNgramCount(ngramType)); + continue; + } + int entryCount = 0; + if (!turncateEntriesInSpecifiedLevel(headerPolicy, + maxEntryCounts.getNgramCount(ngramType), prevWordCount, &entryCount)) { + return false; + } + outEntryCounters->setNgramCount(ngramType, entryCount); + } + return true; +} + +bool LanguageModelDictContent::updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, + const int wordId, const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const entryCountersToUpdate) { + if (!mHasHistoricalInfo) { + AKLOGE("updateAllEntriesOnInputWord is called for dictionary without historical info."); + return false; + } + const ProbabilityEntry originalUnigramProbabilityEntry = getProbabilityEntry(wordId); + const ProbabilityEntry updatedUnigramProbabilityEntry = createUpdatedEntryFrom( + originalUnigramProbabilityEntry, isValid, historicalInfo, headerPolicy); + if (!setProbabilityEntry(wordId, &updatedUnigramProbabilityEntry)) { + return false; + } + mGlobalCounters.incrementTotalCount(); + mGlobalCounters.updateMaxValueOfCounters( + updatedUnigramProbabilityEntry.getHistoricalInfo()->getCount()); + for (size_t i = 0; i < prevWordIds.size(); ++i) { + if (prevWordIds[i] == NOT_A_WORD_ID) { + break; + } + // TODO: Optimize this code.
+ const WordIdArrayView limitedPrevWordIds = prevWordIds.limit(i + 1); + const ProbabilityEntry originalNgramProbabilityEntry = getNgramProbabilityEntry( + limitedPrevWordIds, wordId); + const ProbabilityEntry updatedNgramProbabilityEntry = createUpdatedEntryFrom( + originalNgramProbabilityEntry, isValid, historicalInfo, headerPolicy); + if (!setNgramProbabilityEntry(limitedPrevWordIds, wordId, &updatedNgramProbabilityEntry)) { + return false; + } + mGlobalCounters.updateMaxValueOfCounters( + updatedNgramProbabilityEntry.getHistoricalInfo()->getCount()); + if (!originalNgramProbabilityEntry.isValid()) { + // (i + 2) words are used in total because the prevWords consists of (i + 1) words when + // looking at its i-th element. + entryCountersToUpdate->incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(i + 2)); + } + } + return true; +} + +const ProbabilityEntry LanguageModelDictContent::createUpdatedEntryFrom( + const ProbabilityEntry &originalProbabilityEntry, const bool isValid, + const HistoricalInfo historicalInfo, const HeaderPolicy *const headerPolicy) const { + const HistoricalInfo updatedHistoricalInfo = HistoricalInfo(historicalInfo.getTimestamp(), + 0 /* level */, originalProbabilityEntry.getHistoricalInfo()->getCount() + + historicalInfo.getCount()); + if (originalProbabilityEntry.isValid()) { + return ProbabilityEntry(originalProbabilityEntry.getFlags(), &updatedHistoricalInfo); + } else { + return ProbabilityEntry(0 /* flags */, &updatedHistoricalInfo); + } +} + +bool LanguageModelDictContent::runGCInner( + const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex) { + for (auto &entry : trieMapRange) { + const auto it = terminalIdMap->find(entry.key()); + if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { + // The word has been removed. 
+ continue; + } + if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { + return false; + } + if (entry.hasNextLevelMap()) { + if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), + mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex))) { + return false; + } + } + } + return true; +} + +int LanguageModelDictContent::createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds) { + int lastBitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); + for (const int wordId : prevWordIds) { + const TrieMap::Result result = mTrieMap.get(wordId, lastBitmapEntryIndex); + if (result.mIsValid && result.mNextLevelBitmapEntryIndex != TrieMap::INVALID_INDEX) { + lastBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + continue; + } + if (!result.mIsValid) { + if (!mTrieMap.put(wordId, ProbabilityEntry().encode(mHasHistoricalInfo), + lastBitmapEntryIndex)) { + AKLOGE("Failed to update trie map. wordId: %d, lastBitmapEntryIndex %d", wordId, + lastBitmapEntryIndex); + return TrieMap::INVALID_INDEX; + } + } + lastBitmapEntryIndex = mTrieMap.getNextLevelBitmapEntryIndex(wordId, + lastBitmapEntryIndex); + } + return lastBitmapEntryIndex; +} + +int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { + int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); + for (const int wordId : prevWordIds) { + const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); + if (!result.mIsValid) { + return TrieMap::INVALID_INDEX; + } + bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + } + return bitmapEntryIndex; +} + +bool LanguageModelDictContent::updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, + const int prevWordCount, const HeaderPolicy *const headerPolicy, + const bool needsToHalveCounters, MutableEntryCounters *const outEntryCounters) { + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + if (prevWordCount > 
MAX_PREV_WORD_COUNT_FOR_N_GRAM) { + AKLOGE("Invalid prevWordCount. prevWordCount: %d, MAX_PREV_WORD_COUNT_FOR_N_GRAM: %d.", + prevWordCount, MAX_PREV_WORD_COUNT_FOR_N_GRAM); + return false; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + if (prevWordCount > 0 && probabilityEntry.isValid() + && !mTrieMap.getRoot(entry.key()).mIsValid) { + // The entry is related to a word that has been removed. Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + if (mHasHistoricalInfo && probabilityEntry.isValid()) { + const HistoricalInfo *originalHistoricalInfo = probabilityEntry.getHistoricalInfo(); + if (DynamicLanguageModelProbabilityUtils::shouldRemoveEntryDuringGC( + *originalHistoricalInfo)) { + // Remove the entry. + if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + if (needsToHalveCounters) { + const int updatedCount = originalHistoricalInfo->getCount() / 2; + if (updatedCount == 0) { + // Remove the entry. 
+ if (!mTrieMap.remove(entry.key(), bitmapEntryIndex)) { + return false; + } + continue; + } + const HistoricalInfo historicalInfoToSave(originalHistoricalInfo->getTimestamp(), + originalHistoricalInfo->getLevel(), updatedCount); + const ProbabilityEntry updatedEntry(probabilityEntry.getFlags(), + &historicalInfoToSave); + if (!mTrieMap.put(entry.key(), updatedEntry.encode(mHasHistoricalInfo), + bitmapEntryIndex)) { + return false; + } + } + } + outEntryCounters->incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordCount + 1)); + if (!entry.hasNextLevelMap()) { + continue; + } + if (!updateAllProbabilityEntriesForGCInner(entry.getNextLevelBitmapEntryIndex(), + prevWordCount + 1, headerPolicy, needsToHalveCounters, outEntryCounters)) { + return false; + } + } + return true; +} + +bool LanguageModelDictContent::turncateEntriesInSpecifiedLevel( + const HeaderPolicy *const headerPolicy, const int maxEntryCount, const int targetLevel, + int *const outEntryCount) { + std::vector<int> prevWordIds; + std::vector<EntryInfoToTurncate> entryInfoVector; + if (!getEntryInfo(headerPolicy, targetLevel, mTrieMap.getRootBitmapEntryIndex(), + &prevWordIds, &entryInfoVector)) { + return false; + } + if (static_cast<int>(entryInfoVector.size()) <= maxEntryCount) { + *outEntryCount = static_cast<int>(entryInfoVector.size()); + return true; + } + *outEntryCount = maxEntryCount; + const int entryCountToRemove = static_cast<int>(entryInfoVector.size()) - maxEntryCount; + std::partial_sort(entryInfoVector.begin(), entryInfoVector.begin() + entryCountToRemove, + entryInfoVector.end(), + EntryInfoToTurncate::Comparator()); + for (int i = 0; i < entryCountToRemove; ++i) { + const EntryInfoToTurncate &entryInfo = entryInfoVector[i]; + if (!removeNgramProbabilityEntry( + WordIdArrayView(entryInfo.mPrevWordIds, entryInfo.mPrevWordCount), + entryInfo.mKey)) { + return false; + } + } + return true; +} + +bool LanguageModelDictContent::getEntryInfo(const HeaderPolicy *const 
headerPolicy, + const int targetLevel, const int bitmapEntryIndex, std::vector<int> *const prevWordIds, + std::vector<EntryInfoToTurncate> *const outEntryInfo) const { + const int prevWordCount = prevWordIds->size(); + for (const auto &entry : mTrieMap.getEntriesInSpecifiedLevel(bitmapEntryIndex)) { + if (prevWordCount < targetLevel) { + if (!entry.hasNextLevelMap()) { + continue; + } + prevWordIds->push_back(entry.key()); + if (!getEntryInfo(headerPolicy, targetLevel, entry.getNextLevelBitmapEntryIndex(), + prevWordIds, outEntryInfo)) { + return false; + } + prevWordIds->pop_back(); + continue; + } + const ProbabilityEntry probabilityEntry = + ProbabilityEntry::decode(entry.value(), mHasHistoricalInfo); + const int priority = mHasHistoricalInfo + ? DynamicLanguageModelProbabilityUtils::getPriorityToPreventFromEviction( + *probabilityEntry.getHistoricalInfo()) + : probabilityEntry.getProbability(); + outEntryInfo->emplace_back(priority, probabilityEntry.getHistoricalInfo()->getCount(), + entry.key(), targetLevel, prevWordIds->data()); + } + return true; +} + +bool LanguageModelDictContent::EntryInfoToTurncate::Comparator::operator()( + const EntryInfoToTurncate &left, const EntryInfoToTurncate &right) const { + if (left.mPriority != right.mPriority) { + return left.mPriority < right.mPriority; + } + if (left.mCount != right.mCount) { + return left.mCount < right.mCount; + } + if (left.mKey != right.mKey) { + return left.mKey < right.mKey; + } + if (left.mPrevWordCount != right.mPrevWordCount) { + return left.mPrevWordCount > right.mPrevWordCount; + } + for (int i = 0; i < left.mPrevWordCount; ++i) { + if (left.mPrevWordIds[i] != right.mPrevWordIds[i]) { + return left.mPrevWordIds[i] < right.mPrevWordIds[i]; + } + } + // left and rigth represent the same entry. 
+ return false; +} + +LanguageModelDictContent::EntryInfoToTurncate::EntryInfoToTurncate(const int priority, + const int count, const int key, const int prevWordCount, const int *const prevWordIds) + : mPriority(priority), mCount(count), mKey(key), mPrevWordCount(prevWordCount) { + memmove(mPrevWordIds, prevWordIds, mPrevWordCount * sizeof(mPrevWordIds[0])); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h new file mode 100644 index 000000000..db8c6e12b --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content.h @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H +#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H + +#include <cstdio> +#include <vector> + +#include "defines.h" +#include "dictionary/property/word_attributes.h" +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/trie_map.h" +#include "utils/byte_array_view.h" +#include "utils/int_array_view.h" + +namespace latinime { + +class HeaderPolicy; + +/** + * Class representing language model. + * + * This class provides methods to get and store unigram/n-gram probability information and flags. + */ +class LanguageModelDictContent { + public: + // Pair of word id and probability entry used for iteration. + class WordIdAndProbabilityEntry { + public: + WordIdAndProbabilityEntry(const int wordId, const ProbabilityEntry &probabilityEntry) + : mWordId(wordId), mProbabilityEntry(probabilityEntry) {} + + int getWordId() const { return mWordId; } + const ProbabilityEntry getProbabilityEntry() const { return mProbabilityEntry; } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(WordIdAndProbabilityEntry); + DISALLOW_ASSIGNMENT_OPERATOR(WordIdAndProbabilityEntry); + + const int mWordId; + const ProbabilityEntry mProbabilityEntry; + }; + + // Iterator. 
+ class EntryIterator { + public: + EntryIterator(const TrieMap::TrieMapIterator &trieMapIterator, + const bool hasHistoricalInfo) + : mTrieMapIterator(trieMapIterator), mHasHistoricalInfo(hasHistoricalInfo) {} + + const WordIdAndProbabilityEntry operator*() const { + const TrieMap::TrieMapIterator::IterationResult &result = *mTrieMapIterator; + return WordIdAndProbabilityEntry( + result.key(), ProbabilityEntry::decode(result.value(), mHasHistoricalInfo)); + } + + bool operator!=(const EntryIterator &other) const { + return mTrieMapIterator != other.mTrieMapIterator; + } + + const EntryIterator &operator++() { + ++mTrieMapIterator; + return *this; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryIterator); + DISALLOW_ASSIGNMENT_OPERATOR(EntryIterator); + + TrieMap::TrieMapIterator mTrieMapIterator; + const bool mHasHistoricalInfo; + }; + + // Class represents range to use range base for loops. + class EntryRange { + public: + EntryRange(const TrieMap::TrieMapRange trieMapRange, const bool hasHistoricalInfo) + : mTrieMapRange(trieMapRange), mHasHistoricalInfo(hasHistoricalInfo) {} + + EntryIterator begin() const { + return EntryIterator(mTrieMapRange.begin(), mHasHistoricalInfo); + } + + EntryIterator end() const { + return EntryIterator(mTrieMapRange.end(), mHasHistoricalInfo); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryRange); + DISALLOW_ASSIGNMENT_OPERATOR(EntryRange); + + const TrieMap::TrieMapRange mTrieMapRange; + const bool mHasHistoricalInfo; + }; + + class DumppedFullEntryInfo { + public: + DumppedFullEntryInfo(std::vector<int> &prevWordIds, const int targetWordId, + const WordAttributes &wordAttributes, const ProbabilityEntry &probabilityEntry) + : mPrevWordIds(prevWordIds), mTargetWordId(targetWordId), + mWordAttributes(wordAttributes), mProbabilityEntry(probabilityEntry) {} + + const WordIdArrayView getPrevWordIds() const { return WordIdArrayView(mPrevWordIds); } + int getTargetWordId() const { return mTargetWordId; } + const WordAttributes 
&getWordAttributes() const { return mWordAttributes; } + const ProbabilityEntry &getProbabilityEntry() const { return mProbabilityEntry; } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(DumppedFullEntryInfo); + + const std::vector<int> mPrevWordIds; + const int mTargetWordId; + const WordAttributes mWordAttributes; + const ProbabilityEntry mProbabilityEntry; + }; + + LanguageModelDictContent(const ReadWriteByteArrayView *const buffers, + const bool hasHistoricalInfo) + : mTrieMap(buffers[TRIE_MAP_BUFFER_INDEX]), + mGlobalCounters(buffers[GLOBAL_COUNTERS_BUFFER_INDEX]), + mHasHistoricalInfo(hasHistoricalInfo) {} + + explicit LanguageModelDictContent(const bool hasHistoricalInfo) + : mTrieMap(), mGlobalCounters(), mHasHistoricalInfo(hasHistoricalInfo) {} + + bool isNearSizeLimit() const { + return mTrieMap.isNearSizeLimit() || mGlobalCounters.needsToHalveCounters(); + } + + bool save(FILE *const file) const; + + bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const LanguageModelDictContent *const originalContent); + + const WordAttributes getWordAttributes(const WordIdArrayView prevWordIds, const int wordId, + const bool mustMatchAllPrevWords, const HeaderPolicy *const headerPolicy) const; + + ProbabilityEntry getProbabilityEntry(const int wordId) const { + return getNgramProbabilityEntry(WordIdArrayView(), wordId); + } + + bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { + mGlobalCounters.addToTotalCount(probabilityEntry->getHistoricalInfo()->getCount()); + return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); + } + + bool removeProbabilityEntry(const int wordId) { + return removeNgramProbabilityEntry(WordIdArrayView(), wordId); + } + + ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, + const int wordId) const; + + bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, + const ProbabilityEntry *const 
probabilityEntry); + + bool removeNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId); + + EntryRange getProbabilityEntries(const WordIdArrayView prevWordIds) const; + + std::vector<DumppedFullEntryInfo> exportAllNgramEntriesRelatedToWord( + const HeaderPolicy *const headerPolicy, const int wordId) const; + + bool updateAllProbabilityEntriesForGC(const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const outEntryCounters) { + if (!updateAllProbabilityEntriesForGCInner(mTrieMap.getRootBitmapEntryIndex(), + 0 /* prevWordCount */, headerPolicy, mGlobalCounters.needsToHalveCounters(), + outEntryCounters)) { + return false; + } + if (mGlobalCounters.needsToHalveCounters()) { + mGlobalCounters.halveCounters(); + } + return true; + } + + // entryCounts should be created by updateAllProbabilityEntries. + bool truncateEntries(const EntryCounts &currentEntryCounts, const EntryCounts &maxEntryCounts, + const HeaderPolicy *const headerPolicy, MutableEntryCounters *const outEntryCounters); + + bool updateAllEntriesOnInputWord(const WordIdArrayView prevWordIds, const int wordId, + const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy, + MutableEntryCounters *const entryCountersToUpdate); + + private: + DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); + + class EntryInfoToTurncate { + public: + class Comparator { + public: + bool operator()(const EntryInfoToTurncate &left, + const EntryInfoToTurncate &right) const; + private: + DISALLOW_ASSIGNMENT_OPERATOR(Comparator); + }; + + EntryInfoToTurncate(const int priority, const int count, const int key, + const int prevWordCount, const int *const prevWordIds); + + int mPriority; + // TODO: Remove. 
+ int mCount; + int mKey; + int mPrevWordCount; + int mPrevWordIds[MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1]; + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(EntryInfoToTurncate); + }; + + static const int TRIE_MAP_BUFFER_INDEX; + static const int GLOBAL_COUNTERS_BUFFER_INDEX; + + TrieMap mTrieMap; + LanguageModelDictContentGlobalCounters mGlobalCounters; + const bool mHasHistoricalInfo; + + bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, + const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex); + int createAndGetBitmapEntryIndex(const WordIdArrayView prevWordIds); + int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; + bool updateAllProbabilityEntriesForGCInner(const int bitmapEntryIndex, const int prevWordCount, + const HeaderPolicy *const headerPolicy, const bool needsToHalveCounters, + MutableEntryCounters *const outEntryCounters); + bool turncateEntriesInSpecifiedLevel(const HeaderPolicy *const headerPolicy, + const int maxEntryCount, const int targetLevel, int *const outEntryCount); + bool getEntryInfo(const HeaderPolicy *const headerPolicy, const int targetLevel, + const int bitmapEntryIndex, std::vector<int> *const prevWordIds, + std::vector<EntryInfoToTurncate> *const outEntryInfo) const; + const ProbabilityEntry createUpdatedEntryFrom(const ProbabilityEntry &originalProbabilityEntry, + const bool isValid, const HistoricalInfo historicalInfo, + const HeaderPolicy *const headerPolicy) const; + void exportAllNgramEntriesRelatedToWordInner(const HeaderPolicy *const headerPolicy, + const int bitmapEntryIndex, std::vector<int> *const prevWordIds, + std::vector<DumppedFullEntryInfo> *const outBummpedFullEntryInfo) const; +}; +} // namespace latinime +#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */ diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp 
b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp new file mode 100644 index 000000000..89cf0e306 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/structure/v4/content/language_model_dict_content_global_counters.h" + +#include <climits> + +#include "dictionary/structure/v4/ver4_dict_constants.h" + +namespace latinime { + +const int LanguageModelDictContentGlobalCounters::COUNTER_VALUE_NEAR_LIMIT_THRESHOLD = + (1 << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - 64; +const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD = 1 << 30; +const int LanguageModelDictContentGlobalCounters::COUNTER_SIZE_IN_BYTES = 4; +const int LanguageModelDictContentGlobalCounters::TOTAL_COUNT_INDEX = 0; +const int LanguageModelDictContentGlobalCounters::MAX_VALUE_OF_COUNTERS_INDEX = 1; + +} // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h new file mode 100644 index 000000000..3f87c0ea0 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/content/language_model_dict_content_global_counters.h @@ -0,0 
+1,101 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H +#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H + +#include <cstdio> + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class LanguageModelDictContentGlobalCounters { + public: + explicit LanguageModelDictContentGlobalCounters(const ReadWriteByteArrayView buffer) + : mBuffer(buffer, 0 /* maxAdditionalBufferSize */), + mTotalCount(readValue(mBuffer, TOTAL_COUNT_INDEX)), + mMaxValueOfCounters(readValue(mBuffer, MAX_VALUE_OF_COUNTERS_INDEX)) {} + + LanguageModelDictContentGlobalCounters() + : mBuffer(0 /* maxAdditionalBufferSize */), mTotalCount(0), mMaxValueOfCounters(0) {} + + bool needsToHalveCounters() const { + return mMaxValueOfCounters >= COUNTER_VALUE_NEAR_LIMIT_THRESHOLD + || mTotalCount >= TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; + } + + int getTotalCount() const { + return mTotalCount; + } + + bool save(FILE *const file) const { + BufferWithExtendableBuffer bufferToWrite( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!bufferToWrite.writeUint(mTotalCount, COUNTER_SIZE_IN_BYTES, + TOTAL_COUNT_INDEX * COUNTER_SIZE_IN_BYTES)) { + return false; + } + if 
(!bufferToWrite.writeUint(mMaxValueOfCounters, COUNTER_SIZE_IN_BYTES, + MAX_VALUE_OF_COUNTERS_INDEX * COUNTER_SIZE_IN_BYTES)) { + return false; + } + return DictFileWritingUtils::writeBufferToFileTail(file, &bufferToWrite); + } + + void incrementTotalCount() { + mTotalCount += 1; + } + + void addToTotalCount(const int count) { + mTotalCount += count; + } + + void updateMaxValueOfCounters(const int count) { + mMaxValueOfCounters = std::max(count, mMaxValueOfCounters); + } + + void halveCounters() { + mMaxValueOfCounters /= 2; + mTotalCount /= 2; + } + +private: + DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContentGlobalCounters); + + const static int COUNTER_VALUE_NEAR_LIMIT_THRESHOLD; + const static int TOTAL_COUNT_VALUE_NEAR_LIMIT_THRESHOLD; + const static int COUNTER_SIZE_IN_BYTES; + const static int TOTAL_COUNT_INDEX; + const static int MAX_VALUE_OF_COUNTERS_INDEX; + + BufferWithExtendableBuffer mBuffer; + int mTotalCount; + int mMaxValueOfCounters; + + static int readValue(const BufferWithExtendableBuffer &buffer, const int index) { + const int pos = COUNTER_SIZE_IN_BYTES * index; + if (pos + COUNTER_SIZE_IN_BYTES > buffer.getTailPosition()) { + return 0; + } + return buffer.readUint(COUNTER_SIZE_IN_BYTES, pos); + } +}; +} // namespace latinime +#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_GLOBAL_COUNTERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h b/native/jni/src/dictionary/structure/v4/content/probability_entry.h index feff6b57f..473354b90 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h +++ b/native/jni/src/dictionary/structure/v4/content/probability_entry.h @@ -21,8 +21,10 @@ #include <cstdint> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/historical_info.h" +#include "dictionary/property/historical_info.h" +#include 
"dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" namespace latinime { @@ -34,31 +36,40 @@ class ProbabilityEntry { // Dummy entry ProbabilityEntry() - : mFlags(0), mProbability(NOT_A_PROBABILITY), mHistoricalInfo() {} + : mFlags(Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY), mProbability(NOT_A_PROBABILITY), + mHistoricalInfo() {} // Entry without historical information ProbabilityEntry(const int flags, const int probability) : mFlags(flags), mProbability(probability), mHistoricalInfo() {} // Entry with historical information. - ProbabilityEntry(const int flags, const int probability, - const HistoricalInfo *const historicalInfo) - : mFlags(flags), mProbability(probability), mHistoricalInfo(*historicalInfo) {} - - const ProbabilityEntry createEntryWithUpdatedProbability(const int probability) const { - return ProbabilityEntry(mFlags, probability, &mHistoricalInfo); - } - - const ProbabilityEntry createEntryWithUpdatedHistoricalInfo( - const HistoricalInfo *const historicalInfo) const { - return ProbabilityEntry(mFlags, mProbability, historicalInfo); + ProbabilityEntry(const int flags, const HistoricalInfo *const historicalInfo) + : mFlags(flags), mProbability(NOT_A_PROBABILITY), mHistoricalInfo(*historicalInfo) {} + + // Create from unigram property. + ProbabilityEntry(const UnigramProperty *const unigramProperty) + : mFlags(createFlags(unigramProperty->representsBeginningOfSentence(), + unigramProperty->isNotAWord(), unigramProperty->isBlacklisted(), + unigramProperty->isPossiblyOffensive())), + mProbability(unigramProperty->getProbability()), + mHistoricalInfo(unigramProperty->getHistoricalInfo()) {} + + // Create from ngram property. + // TODO: Set flags. 
+ ProbabilityEntry(const NgramProperty *const ngramProperty) + : mFlags(0), mProbability(ngramProperty->getProbability()), + mHistoricalInfo(ngramProperty->getHistoricalInfo()) {} + + bool isValid() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY) == 0; } bool hasHistoricalInfo() const { return mHistoricalInfo.isValid(); } - int getFlags() const { + uint8_t getFlags() const { return mFlags; } @@ -70,18 +81,34 @@ class ProbabilityEntry { return &mHistoricalInfo; } + bool representsBeginningOfSentence() const { + return (mFlags & Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE) != 0; + } + + bool isNotAWord() const { + return (mFlags & Ver4DictConstants::FLAG_NOT_A_WORD) != 0; + } + + bool isBlacklisted() const { + return (mFlags & Ver4DictConstants::FLAG_BLACKLISTED) != 0; + } + + bool isPossiblyOffensive() const { + return (mFlags & Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE) != 0; + } + uint64_t encode(const bool hasHistoricalInfo) const { - uint64_t encodedEntry = static_cast<uint64_t>(mFlags); + uint64_t encodedEntry = static_cast<uint8_t>(mFlags); if (hasHistoricalInfo) { encodedEntry = (encodedEntry << (Ver4DictConstants::TIME_STAMP_FIELD_SIZE * CHAR_BIT)) - ^ static_cast<uint64_t>(mHistoricalInfo.getTimeStamp()); + | static_cast<uint32_t>(mHistoricalInfo.getTimestamp()); encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_LEVEL_FIELD_SIZE * CHAR_BIT)) - ^ static_cast<uint64_t>(mHistoricalInfo.getLevel()); + | static_cast<uint8_t>(mHistoricalInfo.getLevel()); encodedEntry = (encodedEntry << (Ver4DictConstants::WORD_COUNT_FIELD_SIZE * CHAR_BIT)) - ^ static_cast<uint64_t>(mHistoricalInfo.getCount()); + | static_cast<uint16_t>(mHistoricalInfo.getCount()); } else { encodedEntry = (encodedEntry << (Ver4DictConstants::PROBABILITY_SIZE * CHAR_BIT)) - ^ static_cast<uint64_t>(mProbability); + | static_cast<uint8_t>(mProbability); } return encodedEntry; } @@ -89,7 +116,7 @@ class ProbabilityEntry { static ProbabilityEntry decode(const 
uint64_t encodedEntry, const bool hasHistoricalInfo) { if (hasHistoricalInfo) { const int flags = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, + Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, Ver4DictConstants::TIME_STAMP_FIELD_SIZE + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE + Ver4DictConstants::WORD_COUNT_FIELD_SIZE); @@ -103,10 +130,10 @@ class ProbabilityEntry { const int count = readFromEncodedEntry(encodedEntry, Ver4DictConstants::WORD_COUNT_FIELD_SIZE, 0 /* pos */); const HistoricalInfo historicalInfo(timestamp, level, count); - return ProbabilityEntry(flags, NOT_A_PROBABILITY, &historicalInfo); + return ProbabilityEntry(flags, &historicalInfo); } else { const int flags = readFromEncodedEntry(encodedEntry, - Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE, + Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE, Ver4DictConstants::PROBABILITY_SIZE); const int probability = readFromEncodedEntry(encodedEntry, Ver4DictConstants::PROBABILITY_SIZE, 0 /* pos */); @@ -118,7 +145,7 @@ class ProbabilityEntry { // Copy constructor is public to use this class as a type of return value. 
DISALLOW_ASSIGNMENT_OPERATOR(ProbabilityEntry); - const int mFlags; + const uint8_t mFlags; const int mProbability; const HistoricalInfo mHistoricalInfo; @@ -126,6 +153,24 @@ class ProbabilityEntry { return static_cast<int>( (encodedEntry >> (pos * CHAR_BIT)) & ((1ull << (size * CHAR_BIT)) - 1)); } + + static uint8_t createFlags(const bool representsBeginningOfSentence, + const bool isNotAWord, const bool isBlacklisted, const bool isPossiblyOffensive) { + uint8_t flags = 0; + if (representsBeginningOfSentence) { + flags |= Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + } + if (isNotAWord) { + flags |= Ver4DictConstants::FLAG_NOT_A_WORD; + } + if (isBlacklisted) { + flags |= Ver4DictConstants::FLAG_BLACKLISTED; + } + if (isPossiblyOffensive) { + flags |= Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE; + } + return flags; + } }; } // namespace latinime #endif /* LATINIME_PROBABILITY_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp index 41d9c544c..e3b419449 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.cpp +++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" +#include "dictionary/structure/v4/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h index 7b12aff16..27de4e79e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h +++ b/native/jni/src/dictionary/structure/v4/content/shortcut_dict_content.h @@ -17,21 +17,21 @@ #ifndef LATINIME_SHORTCUT_DICT_CONTENT_H #define LATINIME_SHORTCUT_DICT_CONTENT_H -#include <cstdint> #include <cstdio> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/content/sparse_table_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" namespace latinime { +class ReadWriteByteArrayView; + class ShortcutDictContent : public SparseTableDictContent { public: - ShortcutDictContent(uint8_t *const *buffers, const int *bufferSizes) - : SparseTableDictContent(buffers, bufferSizes, - Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, + ShortcutDictContent(const ReadWriteByteArrayView *const buffers) + : SparseTableDictContent(buffers, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE, Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE) {} ShortcutDictContent() diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h 
b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h index 921774181..6faa9a28b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h +++ b/native/jni/src/dictionary/structure/v4/content/single_dict_content.h @@ -17,22 +17,21 @@ #ifndef LATINIME_SINGLE_DICT_CONTENT_H #define LATINIME_SINGLE_DICT_CONTENT_H -#include <cstdint> #include <cstdio> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/dict_file_writing_utils.h" #include "utils/byte_array_view.h" namespace latinime { class SingleDictContent { public: - SingleDictContent(uint8_t *const buffer, const int bufferSize) - : mExpandableContentBuffer(ReadWriteByteArrayView(buffer, bufferSize), - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} + SingleDictContent(const ReadWriteByteArrayView buffer) + : mExpandableContentBuffer(buffer, + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} SingleDictContent() : mExpandableContentBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE) {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp index 896ce6bd2..685365f36 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.cpp +++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" +#include "dictionary/structure/v4/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/dict_file_writing_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h index c98dd11fd..6245abc8e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h +++ b/native/jni/src/dictionary/structure/v4/content/sparse_table_dict_content.h @@ -17,13 +17,12 @@ #ifndef LATINIME_SPARSE_TABLE_DICT_CONTENT_H #define LATINIME_SPARSE_TABLE_DICT_CONTENT_H -#include <cstdint> #include <cstdio> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/sparse_table.h" #include "utils/byte_array_view.h" namespace latinime { @@ -31,19 +30,13 @@ namespace latinime { // TODO: Support multiple contents. 
class SparseTableDictContent { public: - AK_FORCE_INLINE SparseTableDictContent(uint8_t *const *buffers, const int *bufferSizes, + AK_FORCE_INLINE SparseTableDictContent(const ReadWriteByteArrayView *const buffers, const int sparseTableBlockSize, const int sparseTableDataSize) - : mExpandableLookupTableBuffer( - ReadWriteByteArrayView(buffers[LOOKUP_TABLE_BUFFER_INDEX], - bufferSizes[LOOKUP_TABLE_BUFFER_INDEX]), + : mExpandableLookupTableBuffer(buffers[LOOKUP_TABLE_BUFFER_INDEX], BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableAddressTableBuffer( - ReadWriteByteArrayView(buffers[ADDRESS_TABLE_BUFFER_INDEX], - bufferSizes[ADDRESS_TABLE_BUFFER_INDEX]), + mExpandableAddressTableBuffer(buffers[ADDRESS_TABLE_BUFFER_INDEX], BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableContentBuffer( - ReadWriteByteArrayView(buffers[CONTENT_BUFFER_INDEX], - bufferSizes[CONTENT_BUFFER_INDEX]), + mExpandableContentBuffer(buffers[CONTENT_BUFFER_INDEX], BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), mAddressLookupTable(&mExpandableLookupTableBuffer, &mExpandableAddressTableBuffer, sparseTableBlockSize, sparseTableDataSize) {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp index cf238ee5f..5503151fd 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.cpp +++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.cpp @@ -14,10 +14,9 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -34,7 +33,7 @@ int TerminalPositionLookupTable::getTerminalPtNodePosition(const int terminalId) bool TerminalPositionLookupTable::setTerminalPtNodePosition( const int terminalId, const int terminalPtNodePos) { if (terminalId < 0) { - return NOT_A_DICT_POS; + return false; } while (terminalId >= mSize) { // Write new entry. diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h index b2262bf1e..f45ceb52d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h +++ b/native/jni/src/dictionary/structure/v4/content/terminal_position_lookup_table.h @@ -17,13 +17,13 @@ #ifndef LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H #define LATINIME_TERMINAL_POSITION_LOOKUP_TABLE_H -#include <cstdint> #include <cstdio> #include <unordered_map> #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/single_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/content/single_dict_content.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "utils/byte_array_view.h" namespace latinime { @@ -31,8 +31,8 @@ class TerminalPositionLookupTable : public SingleDictContent { public: typedef std::unordered_map<int, int> TerminalIdMap; - TerminalPositionLookupTable(uint8_t *const buffer, const int bufferSize) - : SingleDictContent(buffer, bufferSize), + 
TerminalPositionLookupTable(const ReadWriteByteArrayView buffer) + : SingleDictContent(buffer), mSize(getBuffer()->getTailPosition() / Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE) {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h index 790273541..25ab22543 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h +++ b/native/jni/src/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h @@ -18,10 +18,10 @@ #define LATINIME_VER4_SHORTCUT_LIST_POLICY_H #include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" +#include "dictionary/structure/pt_common/shortcut/shortcut_list_reading_utils.h" +#include "dictionary/structure/v4/content/shortcut_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp index 3c8008dc4..b0a82839b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.cpp +++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" #include <cerrno> #include <cstring> @@ -23,9 +23,9 @@ #include <sys/types.h> #include <vector> -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/file_utils.h" #include "utils/byte_array_view.h" namespace latinime { @@ -45,16 +45,13 @@ namespace latinime { if (!bodyBuffer) { return Ver4DictBuffersPtr(nullptr); } - std::vector<uint8_t *> buffers; - std::vector<int> bufferSizes; + std::vector<ReadWriteByteArrayView> buffers; const ReadWriteByteArrayView buffer = bodyBuffer->getReadWriteByteArrayView(); int position = 0; while (position < static_cast<int>(buffer.size())) { const int bufferSize = ByteArrayUtils::readUint32AndAdvancePosition( buffer.data(), &position); - const ReadWriteByteArrayView subBuffer = buffer.subView(position, bufferSize); - buffers.push_back(subBuffer.data()); - bufferSizes.push_back(subBuffer.size()); + buffers.push_back(buffer.subView(position, bufferSize)); position += bufferSize; if (bufferSize < 0 || position < 0 || position > static_cast<int>(buffer.size())) { AKLOGE("The dict body file is corrupted."); @@ -66,7 +63,7 @@ namespace latinime { return Ver4DictBuffersPtr(nullptr); } return Ver4DictBuffersPtr(new Ver4DictBuffers(std::move(headerBuffer), std::move(bodyBuffer), - formatVersion, buffers, bufferSizes)); + formatVersion, buffers)); } bool Ver4DictBuffers::flushHeaderAndDictBuffers(const char *const dictDirPath, @@ -162,11 +159,6 @@ bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const { AKLOGE("Language model dict content cannot be written."); return false; } - // Write bigram dict content. 
- if (!mBigramDictContent.flushToFile(file)) { - AKLOGE("Bigram dict content cannot be written."); - return false; - } // Write shortcut dict content. if (!mShortcutDictContent.flushToFile(file)) { AKLOGE("Shortcut dict content cannot be written."); @@ -178,29 +170,18 @@ bool Ver4DictBuffers::flushDictBuffers(FILE *const file) const { Ver4DictBuffers::Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer, MmappedBuffer::MmappedBufferPtr &&bodyBuffer, const FormatUtils::FORMAT_VERSION formatVersion, - const std::vector<uint8_t *> &contentBuffers, const std::vector<int> &contentBufferSizes) + const std::vector<ReadWriteByteArrayView> &contentBuffers) : mHeaderBuffer(std::move(headerBuffer)), mDictBuffer(std::move(bodyBuffer)), mHeaderPolicy(mHeaderBuffer->getReadOnlyByteArrayView().data(), formatVersion), mExpandableHeaderBuffer(mHeaderBuffer->getReadWriteByteArrayView(), BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), - mExpandableTrieBuffer( - ReadWriteByteArrayView(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX], - contentBufferSizes[Ver4DictConstants::TRIE_BUFFER_INDEX]), + mExpandableTrieBuffer(contentBuffers[Ver4DictConstants::TRIE_BUFFER_INDEX], BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE), mTerminalPositionLookupTable( - contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX], - contentBufferSizes[ - Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]), - mLanguageModelDictContent( - ReadWriteByteArrayView( - contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX], - contentBufferSizes[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX]), - mHeaderPolicy.hasHistoricalInfoOfWords()), - mBigramDictContent(&contentBuffers[Ver4DictConstants::BIGRAM_BUFFERS_INDEX], - &contentBufferSizes[Ver4DictConstants::BIGRAM_BUFFERS_INDEX], + contentBuffers[Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX]), + 
mLanguageModelDictContent(&contentBuffers[Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX], mHeaderPolicy.hasHistoricalInfoOfWords()), - mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX], - &contentBufferSizes[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]), + mShortcutDictContent(&contentBuffers[Ver4DictConstants::SHORTCUT_BUFFERS_INDEX]), mIsUpdatable(mDictBuffer->isUpdatable()) {} Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize) @@ -208,7 +189,6 @@ Ver4DictBuffers::Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const i mExpandableHeaderBuffer(Ver4DictConstants::MAX_DICTIONARY_SIZE), mExpandableTrieBuffer(maxTrieSize), mTerminalPositionLookupTable(), mLanguageModelDictContent(headerPolicy->hasHistoricalInfoOfWords()), - mBigramDictContent(headerPolicy->hasHistoricalInfoOfWords()), mShortcutDictContent(), - mIsUpdatable(true) {} + mShortcutDictContent(), mIsUpdatable(true) {} } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h index 68027dcb8..c8270c93c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h +++ b/native/jni/src/dictionary/structure/v4/ver4_dict_buffers.h @@ -21,14 +21,13 @@ #include <memory> #include "defines.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/shortcut_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include 
"suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/v4/content/language_model_dict_content.h" +#include "dictionary/structure/v4/content/shortcut_dict_content.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/mmapped_buffer.h" namespace latinime { @@ -53,7 +52,6 @@ class Ver4DictBuffers { return mExpandableTrieBuffer.isNearSizeLimit() || mTerminalPositionLookupTable.isNearSizeLimit() || mLanguageModelDictContent.isNearSizeLimit() - || mBigramDictContent.isNearSizeLimit() || mShortcutDictContent.isNearSizeLimit(); } @@ -89,14 +87,6 @@ class Ver4DictBuffers { return &mLanguageModelDictContent; } - AK_FORCE_INLINE BigramDictContent *getMutableBigramDictContent() { - return &mBigramDictContent; - } - - AK_FORCE_INLINE const BigramDictContent *getBigramDictContent() const { - return &mBigramDictContent; - } - AK_FORCE_INLINE ShortcutDictContent *getMutableShortcutDictContent() { return &mShortcutDictContent; } @@ -122,8 +112,7 @@ class Ver4DictBuffers { Ver4DictBuffers(MmappedBuffer::MmappedBufferPtr &&headerBuffer, MmappedBuffer::MmappedBufferPtr &&bodyBuffer, const FormatUtils::FORMAT_VERSION formatVersion, - const std::vector<uint8_t *> &contentBuffers, - const std::vector<int> &contentBufferSizes); + const std::vector<ReadWriteByteArrayView> &contentBuffers); Ver4DictBuffers(const HeaderPolicy *const headerPolicy, const int maxTrieSize); @@ -136,7 +125,6 @@ class Ver4DictBuffers { BufferWithExtendableBuffer mExpandableTrieBuffer; TerminalPositionLookupTable mTerminalPositionLookupTable; LanguageModelDictContent mLanguageModelDictContent; - BigramDictContent mBigramDictContent; ShortcutDictContent mShortcutDictContent; const int mIsUpdatable; }; diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp index 93d4e562d..fd6907824 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.cpp +++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" namespace latinime { @@ -29,52 +29,44 @@ const int Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE = 1 * 1024 * 1024; // NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT for Trie and TerminalAddressLookupTable. // NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT for language model. -// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for bigram and shortcut. +// NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT for shortcut. const size_t Ver4DictConstants::NUM_OF_CONTENT_BUFFERS_IN_BODY_FILE = NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT * 2 + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT - + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT * 2; + + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT; const int Ver4DictConstants::TRIE_BUFFER_INDEX = 0; const int Ver4DictConstants::TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX = TRIE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; const int Ver4DictConstants::LANGUAGE_MODEL_BUFFER_INDEX = TERMINAL_ADDRESS_LOOKUP_TABLE_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT; -const int Ver4DictConstants::BIGRAM_BUFFERS_INDEX = - LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT; const int Ver4DictConstants::SHORTCUT_BUFFERS_INDEX = - BIGRAM_BUFFERS_INDEX + NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT; + LANGUAGE_MODEL_BUFFER_INDEX + NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT; const int Ver4DictConstants::NOT_A_TERMINAL_ID = -1; const int Ver4DictConstants::PROBABILITY_SIZE = 1; -const int 
Ver4DictConstants::FLAGS_IN_PROBABILITY_FILE_SIZE = 1; +const int Ver4DictConstants::FLAGS_IN_LANGUAGE_MODEL_SIZE = 1; const int Ver4DictConstants::TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE = 3; const int Ver4DictConstants::NOT_A_TERMINAL_ADDRESS = 0; const int Ver4DictConstants::TERMINAL_ID_FIELD_SIZE = 4; const int Ver4DictConstants::TIME_STAMP_FIELD_SIZE = 4; -const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 1; -const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 1; +const int Ver4DictConstants::WORD_LEVEL_FIELD_SIZE = 0; +const int Ver4DictConstants::WORD_COUNT_FIELD_SIZE = 2; + +const uint8_t Ver4DictConstants::FLAG_REPRESENTS_BEGINNING_OF_SENTENCE = 0x1; +const uint8_t Ver4DictConstants::FLAG_NOT_A_VALID_ENTRY = 0x2; +const uint8_t Ver4DictConstants::FLAG_NOT_A_WORD = 0x4; +const uint8_t Ver4DictConstants::FLAG_BLACKLISTED = 0x8; +const uint8_t Ver4DictConstants::FLAG_POSSIBLY_OFFENSIVE = 0x10; -const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE = 16; -const int Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE = 4; const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE = 64; const int Ver4DictConstants::SHORTCUT_ADDRESS_TABLE_DATA_SIZE = 4; -const int Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE = 3; -// Unsigned int max value of BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE-byte is used for representing -// invalid terminal ID in bigram lists. 
-const int Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID = - (1 << (BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE * 8)) - 1; -const int Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE = 1; -const int Ver4DictConstants::BIGRAM_PROBABILITY_MASK = 0x0F; -const int Ver4DictConstants::BIGRAM_IS_LINK_MASK = 0x80; -const int Ver4DictConstants::BIGRAM_LARGE_PROBABILITY_FIELD_SIZE = 1; - const int Ver4DictConstants::SHORTCUT_FLAGS_FIELD_SIZE = 1; const int Ver4DictConstants::SHORTCUT_PROBABILITY_MASK = 0x0F; const int Ver4DictConstants::SHORTCUT_HAS_NEXT_MASK = 0x80; const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SINGLE_DICT_CONTENT = 1; const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_SPARSE_TABLE_DICT_CONTENT = 3; -const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 1; +const size_t Ver4DictConstants::NUM_OF_BUFFERS_FOR_LANGUAGE_MODEL_DICT_CONTENT = 2; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h index 6950ca70f..13d7a5714 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h +++ b/native/jni/src/dictionary/structure/v4/ver4_dict_constants.h @@ -20,6 +20,7 @@ #include "defines.h" #include <cstddef> +#include <cstdint> namespace latinime { @@ -41,27 +42,24 @@ class Ver4DictConstants { static const int NOT_A_TERMINAL_ID; static const int PROBABILITY_SIZE; - static const int FLAGS_IN_PROBABILITY_FILE_SIZE; + static const int FLAGS_IN_LANGUAGE_MODEL_SIZE; static const int TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE; static const int NOT_A_TERMINAL_ADDRESS; static const int TERMINAL_ID_FIELD_SIZE; static const int TIME_STAMP_FIELD_SIZE; + // TODO: Remove static const int WORD_LEVEL_FIELD_SIZE; static const int WORD_COUNT_FIELD_SIZE; + // Flags in probability entry. 
+ static const uint8_t FLAG_REPRESENTS_BEGINNING_OF_SENTENCE; + static const uint8_t FLAG_NOT_A_VALID_ENTRY; + static const uint8_t FLAG_NOT_A_WORD; + static const uint8_t FLAG_BLACKLISTED; + static const uint8_t FLAG_POSSIBLY_OFFENSIVE; - static const int BIGRAM_ADDRESS_TABLE_BLOCK_SIZE; - static const int BIGRAM_ADDRESS_TABLE_DATA_SIZE; static const int SHORTCUT_ADDRESS_TABLE_BLOCK_SIZE; static const int SHORTCUT_ADDRESS_TABLE_DATA_SIZE; - static const int BIGRAM_FLAGS_FIELD_SIZE; - static const int BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; - static const int INVALID_BIGRAM_TARGET_TERMINAL_ID; - static const int BIGRAM_IS_LINK_MASK; - static const int BIGRAM_PROBABILITY_MASK; - // Used when bigram list has time stamp. - static const int BIGRAM_LARGE_PROBABILITY_FIELD_SIZE; - static const int SHORTCUT_FLAGS_FIELD_SIZE; static const int SHORTCUT_PROBABILITY_MASK; static const int SHORTCUT_HAS_NEXT_MASK; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp index 731092efd..b38b03dcb 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.cpp @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/content/language_model_dict_content.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -50,26 +51,17 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce const int parentPos = DynamicPtReadingUtils::getParentPtNodePos(parentPosOffset, headPos); int codePoints[MAX_WORD_LENGTH]; - const int codePonitCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictBuf, flags, MAX_WORD_LENGTH, codePoints, &pos); + // Code point table is not used for ver4 dictionaries. 
+ const int codePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, flags, MAX_WORD_LENGTH, nullptr /* codePointTable */, codePoints, &pos); int terminalIdFieldPos = NOT_A_DICT_POS; int terminalId = Ver4DictConstants::NOT_A_TERMINAL_ID; - int probability = NOT_A_PROBABILITY; if (PatriciaTrieReadingUtils::isTerminal(flags)) { terminalIdFieldPos = pos; if (usesAdditionalBuffer) { terminalIdFieldPos += mBuffer->getOriginalBufferSize(); } terminalId = Ver4PatriciaTrieReadingUtils::getTerminalIdAndAdvancePosition(dictBuf, &pos); - // TODO: Quit reading probability here. - const ProbabilityEntry probabilityEntry = - mLanguageModelDictContent->getProbabilityEntry(terminalId); - if (probabilityEntry.hasHistoricalInfo()) { - probability = ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), mHeaderPolicy); - } else { - probability = probabilityEntry.getProbability(); - } } int childrenPosFieldPos = pos; if (usesAdditionalBuffer) { @@ -90,8 +82,8 @@ const PtNodeParams Ver4PatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProce // The destination position is stored at the same place as the parent position. 
return fetchPtNodeInfoFromBufferAndProcessMovedPtNode(parentPos, newSiblingNodePos); } else { - return PtNodeParams(headPos, flags, parentPos, codePonitCount, codePoints, - terminalIdFieldPos, terminalId, probability, childrenPosFieldPos, childrenPos, + return PtNodeParams(headPos, flags, parentPos, codePointCount, codePoints, + terminalIdFieldPos, terminalId, NOT_A_PROBABILITY, childrenPosFieldPos, childrenPos, newSiblingNodePos); } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h index a91ad5728..4e5ae3a89 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_reader.h @@ -18,8 +18,8 @@ #define LATINIME_VER4_PATRICIA_TRIE_NODE_READER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_reader.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_reader.h" namespace latinime { @@ -29,15 +29,12 @@ class LanguageModelDictContent; /* * This class is used for helping to read nodes of ver4 patricia trie. This class handles moved - * node and reads node attributes including probability form language model. + * node and reads node attributes. 
*/ class Ver4PatriciaTrieNodeReader : public PtNodeReader { public: - Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, - const LanguageModelDictContent *const languageModelDictContent, - const HeaderPolicy *const headerPolicy) - : mBuffer(buffer), mLanguageModelDictContent(languageModelDictContent), - mHeaderPolicy(headerPolicy) {} + explicit Ver4PatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} ~Ver4PatriciaTrieNodeReader() {} @@ -50,8 +47,6 @@ class Ver4PatriciaTrieNodeReader : public PtNodeReader { DISALLOW_COPY_AND_ASSIGN(Ver4PatriciaTrieNodeReader); const BufferWithExtendableBuffer *const mBuffer; - const LanguageModelDictContent *const mLanguageModelDictContent; - const HeaderPolicy *const mHeaderPolicy; const PtNodeParams fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, const int siblingNodePos) const; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp index 857222f5d..d974b50f4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.cpp @@ -14,20 +14,19 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/forgetting_curve_utils.h" namespace latinime { @@ -62,6 +61,7 @@ bool Ver4PatriciaTrieNodeWriter::markPtNodeAsDeleted( } } +// TODO: Quit using bigramLinkedNodePos. 
bool Ver4PatriciaTrieNodeWriter::markPtNodeAsMoved( const PtNodeParams *const toBeUpdatedPtNodeParams, const int movedPos, const int bigramLinkedNodePos) { @@ -142,13 +142,9 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeUnigramProperty( if (!toBeUpdatedPtNodeParams->isTerminal()) { return false; } - const ProbabilityEntry originalProbabilityEntry = - mBuffers->getLanguageModelDictContent()->getProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId()); - const ProbabilityEntry probabilityEntry = createUpdatedEntryFrom(&originalProbabilityEntry, - unigramProperty); + const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty); return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry); + toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntryOfUnigramProperty); } bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeAfterGC( @@ -160,29 +156,15 @@ bool Ver4PatriciaTrieNodeWriter::updatePtNodeProbabilityAndGetNeedsToKeepPtNodeA const ProbabilityEntry originalProbabilityEntry = mBuffers->getLanguageModelDictContent()->getProbabilityEntry( toBeUpdatedPtNodeParams->getTerminalId()); - if (originalProbabilityEntry.hasHistoricalInfo()) { - const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - originalProbabilityEntry.getHistoricalInfo(), mHeaderPolicy); - const ProbabilityEntry probabilityEntry = - originalProbabilityEntry.createEntryWithUpdatedHistoricalInfo(&historicalInfo); - if (!mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( - toBeUpdatedPtNodeParams->getTerminalId(), &probabilityEntry)) { - AKLOGE("Cannot write updated probability entry. 
terminalId: %d", - toBeUpdatedPtNodeParams->getTerminalId()); - return false; - } - const bool isValid = ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy); - if (!isValid) { - if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { - AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); - return false; - } - } - *outNeedsToKeepPtNode = isValid; - } else { - // No need to update probability. + if (originalProbabilityEntry.isValid()) { *outNeedsToKeepPtNode = true; + return true; } + if (!markPtNodeAsWillBecomeNonTerminal(toBeUpdatedPtNodeParams)) { + AKLOGE("Cannot mark PtNode as willBecomeNonTerminal."); + return false; + } + *outNeedsToKeepPtNode = false; return true; } @@ -205,7 +187,6 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndAdvancePosition( ptNodeWritingPos); } - bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( const PtNodeParams *const ptNodeParams, const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos) { @@ -216,31 +197,43 @@ bool Ver4PatriciaTrieNodeWriter::writeNewTerminalPtNodeAndAdvancePosition( } // Write probability. ProbabilityEntry newProbabilityEntry; - const ProbabilityEntry probabilityEntryToWrite = createUpdatedEntryFrom( - &newProbabilityEntry, unigramProperty); + const ProbabilityEntry probabilityEntryOfUnigramProperty = ProbabilityEntry(unigramProperty); return mBuffers->getMutableLanguageModelDictContent()->setProbabilityEntry( - terminalId, &probabilityEntryToWrite); + terminalId, &probabilityEntryOfUnigramProperty); } +// TODO: Support counting ngram entries. bool Ver4PatriciaTrieNodeWriter::addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const BigramProperty *const bigramProperty, bool *const outAddedNewBigram) { - if (!mBigramPolicy->addNewEntry(prevWordIds[0], wordId, bigramProperty, outAddedNewBigram)) { - AKLOGE("Cannot add new bigram entry. 
terminalId: %d, targetTerminalId: %d", - prevWordIds[0], wordId); + const NgramProperty *const ngramProperty, bool *const outAddedNewBigram) { + LanguageModelDictContent *const languageModelDictContent = + mBuffers->getMutableLanguageModelDictContent(); + const ProbabilityEntry probabilityEntry = + languageModelDictContent->getNgramProbabilityEntry(prevWordIds, wordId); + const ProbabilityEntry probabilityEntryOfNgramProperty(ngramProperty); + if (!languageModelDictContent->setNgramProbabilityEntry( + prevWordIds, wordId, &probabilityEntryOfNgramProperty)) { + AKLOGE("Cannot add new ngram entry. prevWordId[0]: %d, prevWordId.size(): %zd, wordId: %d", + prevWordIds[0], prevWordIds.size(), wordId); return false; } + if (!probabilityEntry.isValid() && outAddedNewBigram) { + *outAddedNewBigram = true; + } return true; } bool Ver4PatriciaTrieNodeWriter::removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId) { - return mBigramPolicy->removeEntry(prevWordIds[0], wordId); + LanguageModelDictContent *const languageModelDictContent = + mBuffers->getMutableLanguageModelDictContent(); + return languageModelDictContent->removeNgramProbabilityEntry(prevWordIds, wordId); } +// TODO: Remove when we stop supporting v402 format. bool Ver4PatriciaTrieNodeWriter::updateAllBigramEntriesAndDeleteUselessEntries( const PtNodeParams *const sourcePtNodeParams, int *const outBigramEntryCount) { - return mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries( - sourcePtNodeParams->getTerminalId(), outBigramEntryCount); + // Do nothing. + return true; } bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( @@ -275,12 +268,6 @@ bool Ver4PatriciaTrieNodeWriter::updateAllPositionFields( if (!updateChildrenPosition(toBeUpdatedPtNodeParams, childrenPos)) { return false; } - - // Counts bigram entries. 
- if (outBigramEntryCount) { - *outBigramEntryCount = mBigramPolicy->getBigramEntryConut( - toBeUpdatedPtNodeParams->getTerminalId()); - } return true; } @@ -289,7 +276,7 @@ bool Ver4PatriciaTrieNodeWriter::addShortcutTarget(const PtNodeParams *const ptN const int shortcutProbability) { if (!mShortcutPolicy->addNewShortcut(ptNodeParams->getTerminalId(), targetCodePoints, targetCodePointCount, shortcutProbability)) { - AKLOGE("Cannot add new shortuct entry. terminalId: %d", ptNodeParams->getTerminalId()); + AKLOGE("Cannot add new shortcut entry. terminalId: %d", ptNodeParams->getTerminalId()); return false; } return true; @@ -346,37 +333,17 @@ bool Ver4PatriciaTrieNodeWriter::writePtNodeAndGetTerminalIdAndAdvancePosition( ptNodeParams->getChildrenPos(), ptNodeWritingPos)) { return false; } - return updatePtNodeFlags(nodePos, ptNodeParams->isBlacklisted(), ptNodeParams->isNotAWord(), - isTerminal, ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); -} - -const ProbabilityEntry Ver4PatriciaTrieNodeWriter::createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, - const UnigramProperty *const unigramProperty) const { - // TODO: Consolidate historical info and probability. 
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - const HistoricalInfo historicalInfoForUpdate(unigramProperty->getTimestamp(), - unigramProperty->getLevel(), unigramProperty->getCount()); - const HistoricalInfo updatedHistoricalInfo = - ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalProbabilityEntry->getHistoricalInfo(), - unigramProperty->getProbability(), &historicalInfoForUpdate, mHeaderPolicy); - return originalProbabilityEntry->createEntryWithUpdatedHistoricalInfo( - &updatedHistoricalInfo); - } else { - return originalProbabilityEntry->createEntryWithUpdatedProbability( - unigramProperty->getProbability()); - } + return updatePtNodeFlags(nodePos, isTerminal, + ptNodeParams->getCodePointCount() > 1 /* hasMultipleChars */); } -bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, - const bool isBlacklisted, const bool isNotAWord, const bool isTerminal, +bool Ver4PatriciaTrieNodeWriter::updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars) { // Create node flags and write them. PatriciaTrieReadingUtils::NodeFlags nodeFlags = - PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, isTerminal, - false /* hasShortcutTargets */, false /* hasBigrams */, hasMultipleChars, - CHILDREN_POSITION_FIELD_SIZE); + PatriciaTrieReadingUtils::createAndGetFlags(false /* isNotAWord */, + false /* isPossiblyOffensive */, isTerminal, false /* hasShortcutTargets */, + false /* hasBigrams */, hasMultipleChars, CHILDREN_POSITION_FIELD_SIZE); if (!DynamicPtWritingUtils::writeFlags(mTrieBuffer, nodeFlags, ptNodePos)) { AKLOGE("Cannot write PtNode flags. 
flags: %x, pos: %d", nodeFlags, ptNodePos); return false; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h index 6703dba04..55856110b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_node_writer.h @@ -18,16 +18,15 @@ #define LATINIME_VER4_PATRICIA_TRIE_NODE_WRITER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_params.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/pt_common/pt_node_params.h" +#include "dictionary/structure/pt_common/pt_node_writer.h" +#include "dictionary/structure/v4/content/probability_entry.h" namespace latinime { class BufferWithExtendableBuffer; class HeaderPolicy; -class Ver4BigramListPolicy; class Ver4DictBuffers; class Ver4PatriciaTrieNodeReader; class Ver4PtNodeArrayReader; @@ -39,13 +38,11 @@ class Ver4ShortcutListPolicy; class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { public: Ver4PatriciaTrieNodeWriter(BufferWithExtendableBuffer *const trieBuffer, - Ver4DictBuffers *const buffers, const HeaderPolicy *const headerPolicy, - const PtNodeReader *const ptNodeReader, + Ver4DictBuffers *const buffers, const PtNodeReader *const ptNodeReader, const PtNodeArrayReader *const ptNodeArrayReader, - Ver4BigramListPolicy *const bigramPolicy, Ver4ShortcutListPolicy *const shortcutPolicy) - : mTrieBuffer(trieBuffer), mBuffers(buffers), mHeaderPolicy(headerPolicy), - mReadingHelper(ptNodeReader, ptNodeArrayReader), mBigramPolicy(bigramPolicy), - 
mShortcutPolicy(shortcutPolicy) {} + Ver4ShortcutListPolicy *const shortcutPolicy) + : mTrieBuffer(trieBuffer), mBuffers(buffers), + mReadingHelper(ptNodeReader, ptNodeArrayReader), mShortcutPolicy(shortcutPolicy) {} virtual ~Ver4PatriciaTrieNodeWriter() {} @@ -76,7 +73,7 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const UnigramProperty *const unigramProperty, int *const ptNodeWritingPos); virtual bool addNgramEntry(const WordIdArrayView prevWordIds, const int wordId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); + const NgramProperty *const ngramProperty, bool *const outAddedNewEntry); virtual bool removeNgramEntry(const WordIdArrayView prevWordIds, const int wordId); @@ -98,23 +95,13 @@ class Ver4PatriciaTrieNodeWriter : public PtNodeWriter { const PtNodeParams *const ptNodeParams, int *const outTerminalId, int *const ptNodeWritingPos); - // Create updated probability entry using given unigram property. In addition to the - // probability, this method updates historical information if needed. - // TODO: Update flags belonging to the unigram property. 
- const ProbabilityEntry createUpdatedEntryFrom( - const ProbabilityEntry *const originalProbabilityEntry, - const UnigramProperty *const unigramProperty) const; - - bool updatePtNodeFlags(const int ptNodePos, const bool isBlacklisted, const bool isNotAWord, - const bool isTerminal, const bool hasMultipleChars); + bool updatePtNodeFlags(const int ptNodePos, const bool isTerminal, const bool hasMultipleChars); static const int CHILDREN_POSITION_FIELD_SIZE; BufferWithExtendableBuffer *const mTrieBuffer; Ver4DictBuffers *const mBuffers; - const HeaderPolicy *const mHeaderPolicy; DynamicPtReadingHelper mReadingHelper; - Ver4BigramListPolicy *const mBigramPolicy; Ver4ShortcutListPolicy *const mShortcutPolicy; }; } // namespace latinime diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp new file mode 100644 index 000000000..1dbec5545 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -0,0 +1,603 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/ver4_patricia_trie_policy.h" + +#include <array> +#include <vector> + +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/ngram_property.h" +#include "dictionary/property/unigram_property.h" +#include "dictionary/property/word_property.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_helper.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/utils/multi_bigram_map.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and +// BinaryDictionaryDecayingTests. +const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; +const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; +const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; +const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = + Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; + +void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); + while (!readingHelper.isEnd()) { + const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); + if (!ptNodeParams.isValid()) { + break; + } + 
const bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); + const int wordId = isTerminal ? ptNodeParams.getTerminalId() : NOT_A_WORD_ID; + childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getChildrenPos(), + wordId, ptNodeParams.getCodePointArrayView()); + readingHelper.readNextSiblingNode(ptNodeParams); + } + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } +} + +int Ver4PatriciaTriePolicy::getCodePointsAndReturnCodePointCount(const int wordId, + const int maxCodePointCount, int *const outCodePoints) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + readingHelper.initWithPtNodePos(ptNodePos); + const int codePointCount = readingHelper.getCodePointsAndReturnCodePointCount( + maxCodePointCount, outCodePoints); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); + } + return codePointCount; +} + +int Ver4PatriciaTriePolicy::getWordId(const CodePointArrayView wordCodePoints, + const bool forceLowerCaseSearch) const { + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const int ptNodePos = readingHelper.getTerminalPtNodePositionOfWord(wordCodePoints.data(), + wordCodePoints.size(), forceLowerCaseSearch); + if (readingHelper.isError()) { + mIsCorrupted = true; + AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); + } + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_WORD_ID; + } + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (ptNodeParams.isDeleted()) { + return NOT_A_WORD_ID; + } + return ptNodeParams.getTerminalId(); +} + +const WordAttributes 
Ver4PatriciaTriePolicy::getWordAttributesInContext( + const WordIdArrayView prevWordIds, const int wordId, + MultiBigramMap *const multiBigramMap) const { + if (wordId == NOT_A_WORD_ID) { + return WordAttributes(); + } + return mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + false /* mustMatchAllPrevWords */, mHeaderPolicy); +} + +int Ver4PatriciaTriePolicy::getProbabilityOfWord(const WordIdArrayView prevWordIds, + const int wordId) const { + if (wordId == NOT_A_WORD_ID || prevWordIds.contains(NOT_A_WORD_ID)) { + return NOT_A_PROBABILITY; + } + const WordAttributes wordAttributes = + mBuffers->getLanguageModelDictContent()->getWordAttributes(prevWordIds, wordId, + true /* mustMatchAllPrevWords */, mHeaderPolicy); + if (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return wordAttributes.getProbability(); +} + +BinaryDictionaryShortcutIterator Ver4PatriciaTriePolicy::getShortcutIterator( + const int wordId) const { + const int shortcutPos = getShortcutPositionOfWord(wordId); + return BinaryDictionaryShortcutIterator(&mShortcutPolicy, shortcutPos); +} + +void Ver4PatriciaTriePolicy::iterateNgramEntries(const WordIdArrayView prevWordIds, + NgramListener *const listener) const { + if (prevWordIds.empty()) { + return; + } + const auto languageModelDictContent = mBuffers->getLanguageModelDictContent(); + for (size_t i = 1; i <= prevWordIds.size(); ++i) { + for (const auto entry : languageModelDictContent->getProbabilityEntries( + prevWordIds.limit(i))) { + const ProbabilityEntry &probabilityEntry = entry.getProbabilityEntry(); + if (!probabilityEntry.isValid()) { + continue; + } + int probability = NOT_A_PROBABILITY; + if (probabilityEntry.hasHistoricalInfo()) { + // TODO: Quit checking count here. + // If count <= 1, the word can be an invaild word. The actual probability should + // be checked using getWordAttributesInContext() in onVisitEntry(). 
+ probability = probabilityEntry.getHistoricalInfo()->getCount() <= 1 ? + NOT_A_PROBABILITY : 0; + } else { + probability = probabilityEntry.getProbability(); + } + listener->onVisitEntry(probability, entry.getWordId()); + } + } +} + +int Ver4PatriciaTriePolicy::getShortcutPositionOfWord(const int wordId) const { + if (wordId == NOT_A_WORD_ID) { + return NOT_A_DICT_POS; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); + if (ptNodeParams.isDeleted()) { + return NOT_A_DICT_POS; + } + return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( + ptNodeParams.getTerminalId()); +} + +bool Ver4PatriciaTriePolicy::addUnigramEntry(const CodePointArrayView wordCodePoints, + const UnigramProperty *const unigramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert to the dictionary, length: %zd", + wordCodePoints.size()); + return false; + } + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %zd", + shortcut.getTargetCodePoints()->size()); + return false; + } + } + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + bool addedNewUnigram = false; + int codePointsToAdd[MAX_WORD_LENGTH]; + int codePointCountToAdd = wordCodePoints.size(); + memmove(codePointsToAdd, wordCodePoints.data(), sizeof(int) * codePointCountToAdd); + if (unigramProperty->representsBeginningOfSentence()) { + codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, + codePointCountToAdd, MAX_WORD_LENGTH); + } + if (codePointCountToAdd <= 0) { + return false; + } + const CodePointArrayView codePointArrayView(codePointsToAdd, codePointCountToAdd); + if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointArrayView, unigramProperty, + &addedNewUnigram)) { + if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { + mEntryCounters.incrementNgramCount(NgramType::Unigram); + } + if (unigramProperty->getShortcuts().size() > 0) { + // Add shortcut target. 
+ const int wordId = getWordId(codePointArrayView, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + AKLOGE("Cannot find word id to add shortcut target."); + return false; + } + const int wordPos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + for (const auto &shortcut : unigramProperty->getShortcuts()) { + if (!mUpdatingHelper.addShortcutTarget(wordPos, + CodePointArrayView(*shortcut.getTargetCodePoints()), + shortcut.getProbability())) { + AKLOGE("Cannot add new shortcut target. PtNodePos: %d, length: %zd, " + "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), + shortcut.getProbability()); + return false; + } + } + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeUnigramEntry(const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); + return false; + } + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + const int ptNodePos = + mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId); + const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); + if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { + AKLOGE("Cannot remove unigram. 
ptNodePos: %d", ptNodePos); + return false; + } + if (!mBuffers->getMutableLanguageModelDictContent()->removeProbabilityEntry(wordId)) { + return false; + } + if (!ptNodeParams.representsNonWordInfo()) { + mEntryCounters.decrementNgramCount(NgramType::Unigram); + } + return true; +} + +bool Ver4PatriciaTriePolicy::addNgramEntry(const NgramProperty *const ngramProperty) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + const NgramContext *const ngramContext = ngramProperty->getNgramContext(); + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for adding n-gram entry to the dictionary."); + return false; + } + if (ngramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { + AKLOGE("The word is too long to insert the ngram to the dictionary. 
" + "length: %zd", ngramProperty->getTargetCodePoints()->size()); + return false; + } + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (prevWordIds.empty()) { + return false; + } + for (size_t i = 0; i < prevWordIds.size(); ++i) { + if (prevWordIds[i] != NOT_A_WORD_ID) { + continue; + } + if (!ngramContext->isNthPrevWordBeginningOfSentence(i + 1 /* n */)) { + return false; + } + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, true /* isNotAWord */, + false /* isBlacklisted */, false /* isPossiblyOffensive */, + MAX_PROBABILITY /* probability */, HistoricalInfo()); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + const int wordId = getWordId(CodePointArrayView(*ngramProperty->getTargetCodePoints()), + false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + bool addedNewEntry = false; + if (mNodeWriter.addNgramEntry(prevWordIds, wordId, ngramProperty, &addedNewEntry)) { + if (addedNewEntry) { + mEntryCounters.incrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); + } + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); + return false; + } + if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { + AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", + mDictBuffer->getTailPosition()); + return false; + } + if (!ngramContext->isValid()) { + AKLOGE("Ngram context is not valid for removing n-gram entry form the dictionary."); + return false; + } + if (wordCodePoints.size() > MAX_WORD_LENGTH) { + AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %zd", + wordCodePoints.size()); + } + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSerch */); + if (prevWordIds.empty() || prevWordIds.contains(NOT_A_WORD_ID)) { + return false; + } + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + return false; + } + if (mNodeWriter.removeNgramEntry(prevWordIds, wordId)) { + mEntryCounters.decrementNgramCount( + NgramUtils::getNgramTypeFromWordCount(prevWordIds.size() + 1)); + return true; + } else { + return false; + } +} + +bool Ver4PatriciaTriePolicy::updateEntriesForWordWithNgramContext( + const NgramContext *const ngramContext, const CodePointArrayView wordCodePoints, + const bool isValidWord, const HistoricalInfo historicalInfo) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: updateEntriesForWordWithNgramContext() is called for non-updatable " + "dictionary."); + return false; + } + const bool updateAsAValidWord = ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */) ? + false : isValidWord; + int wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + // The word is not in the dictionary. 
+ const UnigramProperty unigramProperty(false /* representsBeginningOfSentence */, + false /* isNotAWord */, false /* isBlacklisted */, false /* isPossiblyOffensive */, + NOT_A_PROBABILITY, HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, + 0 /* count */)); + if (!addUnigramEntry(wordCodePoints, &unigramProperty)) { + AKLOGE("Cannot add unigarm entry in updateEntriesForWordWithNgramContext()."); + return false; + } + if (!isValidWord) { + return true; + } + wordId = getWordId(wordCodePoints, false /* tryLowerCaseSearch */); + } + + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds(this, &prevWordIdArray, + false /* tryLowerCaseSearch */); + if (ngramContext->isNthPrevWordBeginningOfSentence(1 /* n */)) { + if (prevWordIds.firstOrDefault(NOT_A_WORD_ID) == NOT_A_WORD_ID) { + const UnigramProperty beginningOfSentenceUnigramProperty( + true /* representsBeginningOfSentence */, + true /* isNotAWord */, false /* isPossiblyOffensive */, NOT_A_PROBABILITY, + HistoricalInfo(historicalInfo.getTimestamp(), 0 /* level */, 0 /* count */)); + if (!addUnigramEntry(ngramContext->getNthPrevWordCodePoints(1 /* n */), + &beginningOfSentenceUnigramProperty)) { + AKLOGE("Cannot add BoS entry in updateEntriesForWordWithNgramContext()."); + return false; + } + // Refresh word ids. + ngramContext->getPrevWordIds(this, &prevWordIdArray, false /* tryLowerCaseSearch */); + } + // Update entries for beginning of sentence. 
+ if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord( + prevWordIds.skip(1 /* n */), prevWordIds[0], true /* isVaild */, historicalInfo, + mHeaderPolicy, &mEntryCounters)) { + return false; + } + } + if (!mBuffers->getMutableLanguageModelDictContent()->updateAllEntriesOnInputWord(prevWordIds, + wordId, updateAsAValidWord, historicalInfo, mHeaderPolicy, &mEntryCounters)) { + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); + return false; + } + if (!mWritingHelper.writeToDictFile(filePath, mEntryCounters.getEntryCounts())) { + AKLOGE("Cannot flush the dictionary to file."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return false; + } + if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { + AKLOGE("Cannot flush the dictionary to file with GC."); + mIsCorrupted = true; + return false; + } + return true; +} + +bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { + if (!mBuffers->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + if (mBuffers->isNearSizeLimit()) { + // Additional buffer size is near the limit. + return true; + } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() + > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { + // Total extended region size of the trie exceeds the limit. + return true; + } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS + && mDictBuffer->getUsedAdditionalBufferSize() > 0) { + // Needs to reduce dictionary size. 
+ return true; + } else if (mHeaderPolicy->isDecayingDict()) { + return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mEntryCounters.getEntryCounts(), + mHeaderPolicy); + } + return false; +} + +void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, + char *const outResult, const int maxResultLength) { + const int compareLength = queryLength + 1 /* terminator */; + if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mEntryCounters.getNgramCount(NgramType::Unigram)); + } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", mEntryCounters.getNgramCount(NgramType::Bigram)); + } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Unigram)) : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { + snprintf(outResult, maxResultLength, "%d", + mHeaderPolicy->isDecayingDict() ? + ForgettingCurveUtils::getEntryCountHardLimit( + mHeaderPolicy->getMaxNgramCounts().getNgramCount( + NgramType::Bigram)) : + static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); + } +} + +const WordProperty Ver4PatriciaTriePolicy::getWordProperty( + const CodePointArrayView wordCodePoints) const { + const int wordId = getWordId(wordCodePoints, false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) { + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); + } + const LanguageModelDictContent *const languageModelDictContent = + mBuffers->getLanguageModelDictContent(); + // Fetch ngram information. 
+ std::vector<NgramProperty> ngrams; + int ngramTargetCodePoints[MAX_WORD_LENGTH]; + int ngramPrevWordsCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; + int ngramPrevWordsCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + bool ngramPrevWordIsBeginningOfSentense[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + for (const auto entry : languageModelDictContent->exportAllNgramEntriesRelatedToWord( + mHeaderPolicy, wordId)) { + const int codePointCount = getCodePointsAndReturnCodePointCount(entry.getTargetWordId(), + MAX_WORD_LENGTH, ngramTargetCodePoints); + const WordIdArrayView prevWordIds = entry.getPrevWordIds(); + for (size_t i = 0; i < prevWordIds.size(); ++i) { + ngramPrevWordsCodePointCount[i] = getCodePointsAndReturnCodePointCount(prevWordIds[i], + MAX_WORD_LENGTH, ngramPrevWordsCodePoints[i]); + ngramPrevWordIsBeginningOfSentense[i] = languageModelDictContent->getProbabilityEntry( + prevWordIds[i]).representsBeginningOfSentence(); + if (ngramPrevWordIsBeginningOfSentense[i]) { + ngramPrevWordsCodePointCount[i] = CharUtils::removeBeginningOfSentenceMarker( + ngramPrevWordsCodePoints[i], ngramPrevWordsCodePointCount[i]); + } + } + const NgramContext ngramContext(ngramPrevWordsCodePoints, ngramPrevWordsCodePointCount, + ngramPrevWordIsBeginningOfSentense, prevWordIds.size()); + const ProbabilityEntry ngramProbabilityEntry = entry.getProbabilityEntry(); + const HistoricalInfo *const historicalInfo = ngramProbabilityEntry.getHistoricalInfo(); + // TODO: Output flags in WordAttributes. + ngrams.emplace_back(ngramContext, + CodePointArrayView(ngramTargetCodePoints, codePointCount).toVector(), + entry.getWordAttributes().getProbability(), *historicalInfo); + } + // Fetch shortcut information. 
+ std::vector<UnigramProperty::ShortcutProperty> shortcuts; + int shortcutPos = getShortcutPositionOfWord(wordId); + if (shortcutPos != NOT_A_DICT_POS) { + int shortcutTarget[MAX_WORD_LENGTH]; + const ShortcutDictContent *const shortcutDictContent = + mBuffers->getShortcutDictContent(); + bool hasNext = true; + while (hasNext) { + int shortcutTargetLength = 0; + int shortcutProbability = NOT_A_PROBABILITY; + shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, + &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); + shortcuts.emplace_back( + CodePointArrayView(shortcutTarget, shortcutTargetLength).toVector(), + shortcutProbability); + } + } + const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes( + WordIdArrayView(), wordId, true /* mustMatchAllPrevWords */, mHeaderPolicy); + const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId); + const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(), + wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(), + wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(), + *historicalInfo, std::move(shortcuts)); + return WordProperty(wordCodePoints.toVector(), unigramProperty, ngrams); +} + +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, + int *const outCodePointCount) { + *outCodePointCount = 0; + if (token == 0) { + mTerminalPtNodePositionsForIteratingWords.clear(); + DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( + &mTerminalPtNodePositionsForIteratingWords); + DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); + } + const int 
terminalPtNodePositionsVectorSize = + static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); + if (token < 0 || token >= terminalPtNodePositionsVectorSize) { + AKLOGE("Given token %d is invalid.", token); + return 0; + } + const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; + const PtNodeParams ptNodeParams = + mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(terminalPtNodePos); + *outCodePointCount = getCodePointsAndReturnCodePointCount(ptNodeParams.getTerminalId(), + MAX_WORD_LENGTH, outCodePoints); + const int nextToken = token + 1; + if (nextToken >= terminalPtNodePositionsVectorSize) { + // All words have been iterated. + mTerminalPtNodePositionsForIteratingWords.clear(); + return 0; + } + return nextToken; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h index faad4290d..d130a4e78 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -20,40 +20,38 @@ #include <vector> #include "defines.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_updating_helper.h" -#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" -#include 
"suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/structure/pt_common/dynamic_pt_updating_helper.h" +#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" +#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "utils/int_array_view.h" namespace latinime { class DicNode; class DicNodeVector; +// Word id = Artificial id that is stored in the PtNode looked up by the word. 
class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: Ver4PatriciaTriePolicy(Ver4DictBuffers::Ver4DictBuffersPtr buffers) : mBuffers(std::move(buffers)), mHeaderPolicy(mBuffers->getHeaderPolicy()), mDictBuffer(mBuffers->getWritableTrieBuffer()), - mBigramPolicy(mBuffers->getMutableBigramDictContent(), - mBuffers->getTerminalPositionLookupTable(), mHeaderPolicy), mShortcutPolicy(mBuffers->getMutableShortcutDictContent(), mBuffers->getTerminalPositionLookupTable()), - mNodeReader(mDictBuffer, mBuffers->getLanguageModelDictContent(), mHeaderPolicy), - mPtNodeArrayReader(mDictBuffer), - mNodeWriter(mDictBuffer, mBuffers.get(), mHeaderPolicy, &mNodeReader, - &mPtNodeArrayReader, &mBigramPolicy, &mShortcutPolicy), + mNodeReader(mDictBuffer), mPtNodeArrayReader(mDictBuffer), + mNodeWriter(mDictBuffer, mBuffers.get(), &mNodeReader, &mPtNodeArrayReader, + &mShortcutPolicy), mUpdatingHelper(mDictBuffer, &mNodeReader, &mNodeWriter), mWritingHelper(mBuffers.get()), - mUnigramCount(mHeaderPolicy->getUnigramCount()), - mBigramCount(mHeaderPolicy->getBigramCount()), + mEntryCounters(mHeaderPolicy->getNgramCounts().getCountArray()), mTerminalPtNodePositionsForIteratingWords(), mIsCorrupted(false) {}; AK_FORCE_INLINE int getRootPosition() const { @@ -63,40 +61,44 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { void createAndGetAllChildDicNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const; - int getCodePointsAndProbabilityAndReturnCodePointCount( - const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const; + int getCodePointsAndReturnCodePointCount(const int wordId, const int maxCodePointCount, + int *const outCodePoints) const; - int getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const; + int getWordId(const CodePointArrayView wordCodePoints, const bool 
forceLowerCaseSearch) const; - int getProbability(const int unigramProbability, const int bigramProbability) const; + const WordAttributes getWordAttributesInContext(const WordIdArrayView prevWordIds, + const int wordId, MultiBigramMap *const multiBigramMap) const; - int getProbabilityOfPtNode(const int *const prevWordsPtNodePos, const int ptNodePos) const; + // TODO: Remove + int getProbability(const int unigramProbability, const int bigramProbability) const { + // Not used. + return NOT_A_PROBABILITY; + } + + int getProbabilityOfWord(const WordIdArrayView prevWordIds, const int wordId) const; - void iterateNgramEntries(const int *const prevWordsPtNodePos, + void iterateNgramEntries(const WordIdArrayView prevWordIds, NgramListener *const listener) const; - int getShortcutPositionOfPtNode(const int ptNodePos) const; + BinaryDictionaryShortcutIterator getShortcutIterator(const int wordId) const; const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { return mHeaderPolicy; } - const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { - return &mShortcutPolicy; - } - - bool addUnigramEntry(const int *const word, const int length, + bool addUnigramEntry(const CodePointArrayView wordCodePoints, const UnigramProperty *const unigramProperty); - bool removeUnigramEntry(const int *const word, const int length); + bool removeUnigramEntry(const CodePointArrayView wordCodePoints); + + bool addNgramEntry(const NgramProperty *const ngramProperty); - bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty); + bool removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints); - bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word1, - const int length1); + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView wordCodePoints, const bool isValidWord, + const 
HistoricalInfo historicalInfo); bool flush(const char *const filePath); @@ -107,8 +109,7 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength); - const WordProperty getWordProperty(const int *const codePoints, - const int codePointCount) const; + const WordProperty getWordProperty(const CodePointArrayView wordCodePoints) const; int getNextWordAndNextToken(const int token, int *const outCodePoints, int *const outCodePointCount); @@ -132,19 +133,17 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const Ver4DictBuffers::Ver4DictBuffersPtr mBuffers; const HeaderPolicy *const mHeaderPolicy; BufferWithExtendableBuffer *const mDictBuffer; - Ver4BigramListPolicy mBigramPolicy; Ver4ShortcutListPolicy mShortcutPolicy; Ver4PatriciaTrieNodeReader mNodeReader; Ver4PtNodeArrayReader mPtNodeArrayReader; Ver4PatriciaTrieNodeWriter mNodeWriter; DynamicPtUpdatingHelper mUpdatingHelper; Ver4PatriciaTrieWritingHelper mWritingHelper; - int mUnigramCount; - int mBigramCount; + MutableEntryCounters mEntryCounters; std::vector<int> mTerminalPtNodePositionsForIteratingWords; mutable bool mIsCorrupted; - int getBigramsPositionOfPtNode(const int ptNodePos) const; + int getShortcutPositionOfWord(const int wordId) const; }; } // namespace latinime #endif // LATINIME_VER4_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp index 254022db4..ccb70cdd3 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" +#include "dictionary/structure/v4/ver4_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h index 466ff55d5..466ff55d5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_reading_utils.h diff --git a/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..6dfdf4d31 --- /dev/null +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" + +#include <cstring> +#include <queue> + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/structure/v4/ver4_dict_constants.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_reader.h" +#include "dictionary/structure/v4/ver4_patricia_trie_node_writer.h" +#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/forgetting_curve_utils.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, + const EntryCounts &entryCounts) const { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, + entryCounts, extendedRegionSize, &headerBuffer)) { + AKLOGE("Cannot write header structure to buffer. 
" + "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, trigramCount: %d," + "extendedRegionSize: %d", false, entryCounts.getNgramCount(NgramType::Unigram), + entryCounts.getNgramCount(NgramType::Bigram), + entryCounts.getNgramCount(NgramType::Trigram), + extendedRegionSize); + return false; + } + return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const dictDirPath) { + const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); + Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( + Ver4DictBuffers::createVer4DictBuffers(headerPolicy, + Ver4DictConstants::MAX_DICTIONARY_SIZE)); + MutableEntryCounters entryCounters; + if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &entryCounters)) { + return false; + } + BufferWithExtendableBuffer headerBuffer( + BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + entryCounters.getEntryCounts(), 0 /* extendedRegionSize */, &headerBuffer)) { + return false; + } + return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); +} + +bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, + MutableEntryCounters *const outEntryCounters) { + Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer()); + Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); + Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), + mBuffers->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), + mBuffers, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy); + + if (!mBuffers->getMutableLanguageModelDictContent()->updateAllProbabilityEntriesForGC( + headerPolicy, outEntryCounters)) { + 
AKLOGE("Failed to update probabilities in language model dict content."); + return false; + } + if (headerPolicy->isDecayingDict()) { + const EntryCounts &maxEntryCounts = headerPolicy->getMaxNgramCounts(); + if (!mBuffers->getMutableLanguageModelDictContent()->truncateEntries( + outEntryCounters->getEntryCounts(), maxEntryCounts, headerPolicy, + outEntryCounters)) { + AKLOGE("Failed to truncate entries in language model dict content."); + return false; + } + } + + DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + &ptNodeWriter); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, &ptNodeReader, &ptNodeArrayReader, &shortcutPolicy); + DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, + buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instances for the GCed dictionary. 
+ Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer()); + Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); + Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), + buffersToWrite->getTerminalPositionLookupTable()); + Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), + buffersToWrite, &newPtNodeReader, &newPtNodeArrayreader, + &newShortcutPolicy); + // Re-assign terminal IDs for valid terminal PtNodes. + TerminalPositionLookupTable::TerminalIdMap terminalIdMap; + if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( + &terminalIdMap)) { + return false; + } + // Run GC for language model dict content. + if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap, + mBuffers->getLanguageModelDictContent())) { + return false; + } + // Run GC for shortcut dict content. + if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, + mBuffers->getShortcutDictContent())) { + return false; + } + DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); + if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { + return false; + } + return true; +} + +bool 
Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds + ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { + if (!ptNodeParams->isTerminal()) { + return true; + } + TerminalPositionLookupTable::TerminalIdMap::const_iterator it = + mTerminalIdMap->find(ptNodeParams->getTerminalId()); + if (it == mTerminalIdMap->end()) { + AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", + ptNodeParams->getTerminalId(), mTerminalIdMap->size()); + return false; + } + if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { + AKLOGE("Cannot update terminal id. %d -> %d", it->first, it->second); + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h index bb464ad28..68dd1caa2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h +++ b/native/jni/src/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h @@ -18,8 +18,9 @@ #define LATINIME_VER4_PATRICIA_TRIE_WRITING_HELPER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/structure/pt_common/dynamic_pt_gc_event_listeners.h" +#include "dictionary/structure/v4/content/terminal_position_lookup_table.h" +#include "dictionary/utils/entry_counters.h" namespace latinime { @@ -33,8 +34,7 @@ class Ver4PatriciaTrieWritingHelper { Ver4PatriciaTrieWritingHelper(Ver4DictBuffers *const buffers) : mBuffers(buffers) {} - bool writeToDictFile(const char *const dictDirPath, const int unigramCount, - const int bigramCount) const; + bool writeToDictFile(const char *const dictDirPath, const EntryCounts &entryCounts) const; // This method cannot be const 
because the original dictionary buffer will be updated to detect // useless PtNodes during GC. @@ -66,57 +66,8 @@ class Ver4PatriciaTrieWritingHelper { const TerminalPositionLookupTable::TerminalIdMap *const mTerminalIdMap; }; - // For truncateUnigrams() and truncateBigrams(). - class DictProbability { - public: - DictProbability(const int dictPos, const int probability, const int timestamp) - : mDictPos(dictPos), mProbability(probability), mTimestamp(timestamp) {} - - int getDictPos() const { - return mDictPos; - } - - int getProbability() const { - return mProbability; - } - - int getTimestamp() const { - return mTimestamp; - } - - private: - DISALLOW_DEFAULT_CONSTRUCTOR(DictProbability); - - int mDictPos; - int mProbability; - int mTimestamp; - }; - - // For truncateUnigrams() and truncateBigrams(). - class DictProbabilityComparator { - public: - bool operator()(const DictProbability &left, const DictProbability &right) { - if (left.getProbability() != right.getProbability()) { - return left.getProbability() > right.getProbability(); - } - if (left.getTimestamp() != right.getTimestamp()) { - return left.getTimestamp() < right.getTimestamp(); - } - return left.getDictPos() > right.getDictPos(); - } - - private: - DISALLOW_ASSIGNMENT_OPERATOR(DictProbabilityComparator); - }; - bool runGC(const int rootPtNodeArrayPos, const HeaderPolicy *const headerPolicy, - Ver4DictBuffers *const buffersToWrite, int *const outUnigramCount, - int *const outBigramCount); - - bool truncateUnigrams(const Ver4PatriciaTrieNodeReader *const ptNodeReader, - Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount); - - bool truncateBigrams(const int maxBigramCount); + Ver4DictBuffers *const buffersToWrite, MutableEntryCounters *const outEntryCounters); Ver4DictBuffers *const mBuffers; }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp index 
b014c523d..63d0b4ad5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.cpp +++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.cpp @@ -14,11 +14,11 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" +#include "dictionary/structure/v4/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_utils.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/structure/pt_common/dynamic_pt_reading_utils.h" +#include "dictionary/structure/pt_common/patricia_trie_reading_utils.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h index d81808efc..ccb760bc1 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h +++ b/native/jni/src/dictionary/structure/v4/ver4_pt_node_array_reader.h @@ -18,7 +18,7 @@ #define LATINIME_VER4_PT_NODE_ARRAY_READER_H #include "defines.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/pt_node_array_reader.h" +#include "dictionary/structure/pt_common/pt_node_array_reader.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h index 178b06554..8a614730b 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h +++ b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h @@ -18,7 +18,7 @@ #define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H #include "defines.h" -#include 
"suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h index 558e0a5c3..a4ddd58c2 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h +++ b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h @@ -18,7 +18,7 @@ #define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H #include "defines.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" namespace latinime { @@ -31,6 +31,11 @@ class BinaryDictionaryShortcutIterator { mPos(shortcutStructurePolicy->getStartPos(shortcutPos)), mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {} + BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator) + : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy), + mPos(shortcutIterator.mPos), + mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {} + AK_FORCE_INLINE bool hasNextShortcutTarget() const { return mHasNextShortcutTarget; } @@ -45,7 +50,8 @@ class BinaryDictionaryShortcutIterator { } private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryShortcutIterator); + DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator); + DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator); const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy; int mPos; diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/dictionary/utils/bloom_filter.h index 1e60f49ed..1e60f49ed 100644 --- a/native/jni/src/suggest/core/dictionary/bloom_filter.h +++ b/native/jni/src/dictionary/utils/bloom_filter.h diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp index 833063c17..217569651 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp +++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -31,7 +31,7 @@ uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) con uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size, int *const pos) const { - const int value = readUint(size, *pos); + const uint32_t value = readUint(size, *pos); *pos += size; return value; } @@ -42,8 +42,10 @@ void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxC if (readingPosIsInAdditionalBuffer) { *pos -= mOriginalBuffer.size(); } + // Code point table is not used for dynamic format. 
*outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( - getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, outCodePoints, pos); + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, + nullptr /* codePointTable */, outCodePoints, pos); if (readingPosIsInAdditionalBuffer) { *pos += mOriginalBuffer.size(); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h index fad83aa25..0a141d4db 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h +++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h @@ -22,7 +22,7 @@ #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" #include "utils/byte_array_view.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/native/jni/src/dictionary/utils/byte_array_utils.cpp index 1833e8832..d38f08217 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp +++ b/native/jni/src/dictionary/utils/byte_array_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/dictionary/utils/byte_array_utils.h index c0a9fcb1d..abb979050 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +++ b/native/jni/src/dictionary/utils/byte_array_utils.h @@ -114,7 +114,7 @@ class ByteArrayUtils { return buffer[(*pos)++]; } - static AK_FORCE_INLINE int readUint(const uint8_t *const buffer, + static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, const int size, const int pos) { // size must be in 1 to 4. 
ASSERT(size >= 1 && size <= 4); @@ -147,11 +147,18 @@ class ByteArrayUtils { */ static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { int p = pos; - return readCodePointAndAdvancePosition(buffer, &p); + return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); } static AK_FORCE_INLINE int readCodePointAndAdvancePosition( - const uint8_t *const buffer, int *const pos) { + const uint8_t *const buffer, const int *const codePointTable, int *const pos) { + /* + * codePointTable is an array to convert the most frequent characters in this dictionary to + * 1 byte code points. It is only made of the original code points of the most frequent + * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. + * The original code points are restored by picking the code points at the indices of the + * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. + */ const uint8_t firstByte = readUint8(buffer, *pos); if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { if (firstByte == CHARACTER_ARRAY_TERMINATOR) { @@ -162,6 +169,9 @@ class ByteArrayUtils { } } else { *pos += 1; + if (codePointTable) { + return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; + } return firstByte; } } @@ -173,12 +183,13 @@ class ByteArrayUtils { */ // Returns the length of the string. 
static int readStringAndAdvancePosition(const uint8_t *const buffer, - const int maxLength, int *const outBuffer, int *const pos) { + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos) { int length = 0; - int codePoint = readCodePointAndAdvancePosition(buffer, pos); + int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); while (NOT_A_CODE_POINT != codePoint && length < maxLength) { outBuffer[length++] = codePoint; - codePoint = readCodePointAndAdvancePosition(buffer, pos); + codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); } return length; } @@ -187,9 +198,9 @@ class ByteArrayUtils { static int advancePositionToBehindString( const uint8_t *const buffer, const int maxLength, int *const pos) { int length = 0; - int codePoint = readCodePointAndAdvancePosition(buffer, pos); + int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); while (NOT_A_CODE_POINT != codePoint && length < maxLength) { - codePoint = readCodePointAndAdvancePosition(buffer, pos); + codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); length++; } return length; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp index b7e2a7278..033a758ba 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/dict_file_writing_utils.h" #include <cstdio> #include <errno.h> @@ -22,13 +22,14 @@ #include <sys/stat.h> #include <sys/types.h> -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/backward/v402/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_writing_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/format_utils.h" #include "utils/time_keeper.h" namespace latinime { @@ -43,13 +44,13 @@ const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4; TimeKeeper::setCurrentTime(); const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); switch (formatVersion) { - case FormatUtils::VERSION_4: + case FormatUtils::VERSION_402: return createEmptyV4DictFile<backward::v402::Ver4DictConstants, backward::v402::Ver4DictBuffers, backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>( filePath, localeAsCodePointVector, attributeMap, formatVersion); case FormatUtils::VERSION_4_ONLY_FOR_TESTING: - case FormatUtils::VERSION_4_DEV: + case FormatUtils::VERSION_403: return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers, Ver4DictBuffers::Ver4DictBuffersPtr>( filePath, 
localeAsCodePointVector, attributeMap, formatVersion); @@ -69,8 +70,7 @@ template<class DictConstants, class DictBuffers, class DictBuffersPtr> DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - 0 /* unigramCount */, 0 /* bigramCount */, - 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); + EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); if (!DynamicPtWritingUtils::writeEmptyDictionary( dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/dictionary/utils/dict_file_writing_utils.h index 4843b3b32..102a89da4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h +++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.h @@ -20,8 +20,8 @@ #include <cstdio> #include "defines.h" -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" -#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/utils/format_utils.h" namespace latinime { diff --git a/native/jni/src/dictionary/utils/entry_counters.h b/native/jni/src/dictionary/utils/entry_counters.h new file mode 100644 index 000000000..5e443026e --- /dev/null +++ b/native/jni/src/dictionary/utils/entry_counters.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ENTRY_COUNTERS_H +#define LATINIME_ENTRY_COUNTERS_H + +#include <array> + +#include "defines.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +// Copyable but immutable +class EntryCounts final { + public: + EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {} + + explicit EntryCounts(const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters) + : mEntryCounts(counters) {} + + int getNgramCount(const NgramType ngramType) const { + return mEntryCounts[static_cast<int>(ngramType)]; + } + + const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &getCountArray() const { + return mEntryCounts; + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts); + + // Counts from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram + // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element) + const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounts; +}; + +class MutableEntryCounters final { + public: + MutableEntryCounters() { + mEntryCounters.fill(0); + } + + explicit MutableEntryCounters( + const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters) + : mEntryCounters(counters) {} + + const EntryCounts getEntryCounts() const { + return EntryCounts(mEntryCounters); + } + + void incrementNgramCount(const NgramType ngramType) { + ++mEntryCounters[static_cast<int>(ngramType)]; + } + + void decrementNgramCount(const NgramType ngramType) { + --mEntryCounters[static_cast<int>(ngramType)]; + } + + int getNgramCount(const NgramType ngramType) const { + return 
mEntryCounters[static_cast<int>(ngramType)]; + } + + void setNgramCount(const NgramType ngramType, const int count) { + mEntryCounters[static_cast<int>(ngramType)] = count; + } + + private: + DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters); + + // Counters from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram + // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element) + std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounters; +}; +} // namespace latinime +#endif /* LATINIME_ENTRY_COUNTERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp b/native/jni/src/dictionary/utils/file_utils.cpp index fb80f38c5..bb392fb32 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.cpp +++ b/native/jni/src/dictionary/utils/file_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "dictionary/utils/file_utils.h" #include <cstdio> #include <cstring> diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h b/native/jni/src/dictionary/utils/file_utils.h index 4f1b93a6a..4f1b93a6a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/file_utils.h +++ b/native/jni/src/dictionary/utils/file_utils.h diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp index fed0ae77e..d79ed911b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.cpp +++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp @@ -14,14 +14,14 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" +#include "dictionary/utils/forgetting_curve_utils.h" #include <algorithm> #include <cmath> #include <stdlib.h> -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" +#include "dictionary/header/header_policy.h" +#include "dictionary/utils/probability_utils.h" #include "utils/time_keeper.h" namespace latinime { @@ -29,13 +29,16 @@ namespace latinime { const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; -const int ForgettingCurveUtils::MAX_LEVEL = 3; -const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 1; -const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 15; -const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 14; +const int ForgettingCurveUtils::MAX_LEVEL = 15; +const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2; +const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31; +const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30; +const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; +// TODO: Evaluate whether this should be 7.5 days. 
+// 15 days +const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; -const float ForgettingCurveUtils::UNIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2; -const float ForgettingCurveUtils::BIGRAM_COUNT_HARD_LIMIT_WEIGHT = 1.2; +const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; @@ -43,7 +46,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT /* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( const HistoricalInfo *const originalHistoricalInfo, const int newProbability, const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { - const int timestamp = newHistoricalInfo->getTimeStamp(); + const int timestamp = newHistoricalInfo->getTimestamp(); if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { // Add entry as a valid word. const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); @@ -54,19 +57,23 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { // Initial information. 
+ int count = newHistoricalInfo->getCount(); + if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1); + return HistoricalInfo(timestamp, level, 0 /* count */); + } const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); - const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); - return HistoricalInfo(timestamp, level, count); + return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy)); } else { const int updatedCount = originalHistoricalInfo->getCount() + 1; - if (updatedCount >= headerPolicy->getForgettingCurveOccurrencesToLevelUp()) { + if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) { // The count exceeds the max value the level can be incremented. if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { // The level is already max. return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); } else { - // Level up. + // Raise the level. 
return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel() + 1, 0 /* count */); } @@ -78,66 +85,54 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT /* static */ int ForgettingCurveUtils::decodeProbability( const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { - const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimeStamp(), - headerPolicy->getForgettingCurveDurationToLevelDown()); + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS); return sProbabilityTable.getProbability( headerPolicy->getForgettingCurveProbabilityValuesTableId(), clampToValidLevelRange(historicalInfo->getLevel()), clampToValidTimeStepCountRange(elapsedTimeStepCount)); } -/* static */ int ForgettingCurveUtils::getProbability(const int unigramProbability, - const int bigramProbability) { - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (bigramProbability == NOT_A_PROBABILITY) { - return std::min(backoff(unigramProbability), MAX_PROBABILITY); - } else { - // TODO: Investigate better way to handle bigram probability. 
- return std::min(std::max(unigramProbability, - bigramProbability + MULTIPLIER_TWO_IN_PROBABILITY_SCALE), MAX_PROBABILITY); - } -} - /* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { return historicalInfo->getLevel() > 0 - || getElapsedTimeStepCount(historicalInfo->getTimeStamp(), - headerPolicy->getForgettingCurveDurationToLevelDown()) + || getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS) < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; } /* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( const HistoricalInfo *const originalHistoricalInfo, const HeaderPolicy *const headerPolicy) { - if (originalHistoricalInfo->getTimeStamp() == NOT_A_TIMESTAMP) { + if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) { return HistoricalInfo(); } - const int durationToLevelDownInSeconds = headerPolicy->getForgettingCurveDurationToLevelDown(); + const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; const int elapsedTimeStep = getElapsedTimeStepCount( - originalHistoricalInfo->getTimeStamp(), durationToLevelDownInSeconds); + originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds); if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { // No need to update historical info. return *originalHistoricalInfo; } - // Level down. + // Lower the level. const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? 
originalHistoricalInfo->getLevel() : maxLevelDownAmonut; - const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimeStamp() + + const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() + levelDownAmount * durationToLevelDownInSeconds; return HistoricalInfo(adjustedTimestampInSeconds, originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); } /* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, - const int unigramCount, const int bigramCount, const HeaderPolicy *const headerPolicy) { - if (unigramCount >= getUnigramCountHardLimit(headerPolicy->getMaxUnigramCount())) { - // Unigram count exceeds the limit. - return true; - } else if (bigramCount >= getBigramCountHardLimit(headerPolicy->getMaxBigramCount())) { - // Bigram count exceeds the limit. - return true; + const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { + const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts(); + for (const auto ngramType : AllNgramTypes::ASCENDING) { + if (entryCounts.getNgramCount(ngramType) + >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) { + // Unigram count exceeds the limit. 
+ return true; + } } if (mindsBlockByDecay) { return false; @@ -170,7 +165,7 @@ const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityT /* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, const HeaderPolicy *const headerPolicy) { - return std::min(std::max(count, 0), headerPolicy->getForgettingCurveOccurrencesToLevelUp() - 1); + return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1); } /* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { @@ -187,9 +182,9 @@ const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; -const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 32; -const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 35; -const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 40; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10; ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { @@ -202,7 +197,7 @@ ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; ++timeStepCount) { - if (level == 0) { + if (level < MIN_VISIBLE_LEVEL) { mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; continue; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/dictionary/utils/forgetting_curve_utils.h index 
9910777b8..ddaac7e3b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h +++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.h @@ -20,7 +20,8 @@ #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/utils/historical_info.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/utils/entry_counters.h" namespace latinime { @@ -39,23 +40,20 @@ class ForgettingCurveUtils { static int decodeProbability(const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy); - static int getProbability(const int encodedUnigramProbability, - const int encodedBigramProbability); - static bool needsToKeep(const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy); - static bool needsToDecay(const bool mindsBlockByDecay, const int unigramCount, - const int bigramCount, const HeaderPolicy *const headerPolicy); + static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters, + const HeaderPolicy *const headerPolicy); - AK_FORCE_INLINE static int getUnigramCountHardLimit(const int maxUnigramCount) { - return static_cast<int>(static_cast<float>(maxUnigramCount) - * UNIGRAM_COUNT_HARD_LIMIT_WEIGHT); + // TODO: Improve probability computation method and remove this. 
+ static int getProbabilityBiasForNgram(const int n) { + return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE; } - AK_FORCE_INLINE static int getBigramCountHardLimit(const int maxBigramCount) { - return static_cast<int>(static_cast<float>(maxBigramCount) - * BIGRAM_COUNT_HARD_LIMIT_WEIGHT); + AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) { + return static_cast<int>(static_cast<float>(maxEntryCount) + * ENTRY_COUNT_HARD_LIMIT_WEIGHT); } private: @@ -96,9 +94,10 @@ class ForgettingCurveUtils { static const int MIN_VISIBLE_LEVEL; static const int MAX_ELAPSED_TIME_STEP_COUNT; static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + static const int OCCURRENCES_TO_RAISE_THE_LEVEL; + static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; - static const float UNIGRAM_COUNT_HARD_LIMIT_WEIGHT; - static const float BIGRAM_COUNT_HARD_LIMIT_WEIGHT; + static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT; static const ProbabilityTable sProbabilityTable; diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/dictionary/utils/format_utils.cpp index 1916ea560..cef3b094c 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/dictionary/utils/format_utils.cpp @@ -14,40 +14,44 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "dictionary/utils/format_utils.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "dictionary/utils/byte_array_utils.h" namespace latinime { const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; // Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 -const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; +const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; /* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { switch (formatVersion) { case VERSION_2: - return VERSION_2; + case VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return UNKNOWN_VERSION; + case VERSION_202: + return VERSION_202; case VERSION_4_ONLY_FOR_TESTING: return VERSION_4_ONLY_FOR_TESTING; - case VERSION_4: - return VERSION_4; - case VERSION_4_DEV: - return VERSION_4_DEV; + case VERSION_402: + return VERSION_402; + case VERSION_403: + return VERSION_403; default: return UNKNOWN_VERSION; } } /* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( - const uint8_t *const dict, const int dictSize) { + const ReadOnlyByteArrayView dictBuffer) { // The magic number is stored big-endian. // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't // understand this format. - if (dictSize < DICTIONARY_MINIMUM_SIZE) { + if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) { return UNKNOWN_VERSION; } - const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0); + const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0); switch (magicNumber) { case MAGIC_NUMBER: // The layout of the header is as follows: @@ -58,7 +62,7 @@ const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; // Conceptually this converts the hardcoded value of the bytes in the file into // the symbolic value we use in the code. 
But we want the constants to be the // same so we use them for both here. - return getFormatVersion(ByteArrayUtils::readUint16(dict, 4)); + return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4)); default: return UNKNOWN_VERSION; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/dictionary/utils/format_utils.h index 55ad5799f..1616efcce 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/dictionary/utils/format_utils.h @@ -20,6 +20,7 @@ #include <cstdint> #include "defines.h" +#include "utils/byte_array_view.h" namespace latinime { @@ -30,10 +31,15 @@ class FormatUtils { public: enum FORMAT_VERSION { // These MUST have the same values as the relevant constants in FormatSpec.java. + // TODO: Remove VERSION_2 and VERSION_201 when we: + // * Confirm that old versions of LatinIME download old-format dictionaries + // * We no longer need the corresponding constants on the Java side for dicttool VERSION_2 = 2, + VERSION_201 = 201, + VERSION_202 = 202, VERSION_4_ONLY_FOR_TESTING = 399, - VERSION_4 = 402, - VERSION_4_DEV = 403, + VERSION_402 = 402, + VERSION_403 = 403, UNKNOWN_VERSION = -1 }; @@ -42,12 +48,12 @@ class FormatUtils { static const uint32_t MAGIC_NUMBER; static FORMAT_VERSION getFormatVersion(const int formatVersion); - static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); + static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer); private: DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); - static const int DICTIONARY_MINIMUM_SIZE; + static const size_t DICTIONARY_MINIMUM_SIZE; }; } // namespace latinime #endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/dictionary/utils/mmapped_buffer.cpp index 4a126ff85..c5259de6d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.cpp +++ 
b/native/jni/src/dictionary/utils/mmapped_buffer.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" +#include "dictionary/utils/mmapped_buffer.h" #include <cerrno> #include <climits> @@ -23,7 +23,7 @@ #include <sys/mman.h> #include <unistd.h> -#include "suggest/policyimpl/dictionary/utils/file_utils.h" +#include "dictionary/utils/file_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/native/jni/src/dictionary/utils/mmapped_buffer.h index e25310373..e25310373 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h +++ b/native/jni/src/dictionary/utils/mmapped_buffer.h diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp b/native/jni/src/dictionary/utils/multi_bigram_map.cpp index 91f33a8dd..e730fff8e 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.cpp +++ b/native/jni/src/dictionary/utils/multi_bigram_map.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/core/dictionary/multi_bigram_map.h" +#include "dictionary/utils/multi_bigram_map.h" #include <cstddef> #include <unordered_map> @@ -35,39 +35,37 @@ const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = // Also caches the bigrams if there is space remaining and they have not been cached already. 
int MultiBigramMap::getBigramProbability( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos, const int nextWordPosition, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) { - if (!prevWordsPtNodePos || prevWordsPtNodePos[0] == NOT_A_DICT_POS) { + if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) { return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); } - std::unordered_map<int, BigramMap>::const_iterator mapPosition = - mBigramMaps.find(prevWordsPtNodePos[0]); + const auto mapPosition = mBigramMaps.find(prevWordIds[0]); if (mapPosition != mBigramMaps.end()) { - return mapPosition->second.getBigramProbability(structurePolicy, nextWordPosition, + return mapPosition->second.getBigramProbability(structurePolicy, nextWordId, unigramProbability); } if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { - addBigramsForWordPosition(structurePolicy, prevWordsPtNodePos); - return mBigramMaps[prevWordsPtNodePos[0]].getBigramProbability(structurePolicy, - nextWordPosition, unigramProbability); + addBigramsForWord(structurePolicy, prevWordIds); + return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy, + nextWordId, unigramProbability); } - return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordsPtNodePos, - nextWordPosition, unigramProbability); + return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds, + nextWordId, unigramProbability); } void MultiBigramMap::BigramMap::init( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos) { - structurePolicy->iterateNgramEntries(prevWordsPtNodePos, this /* listener */); + const WordIdArrayView prevWordIds) { + structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */); } int MultiBigramMap::BigramMap::getBigramProbability( const DictionaryStructureWithBufferPolicy *const 
structurePolicy, - const int nextWordPosition, const int unigramProbability) const { + const int nextWordId, const int unigramProbability) const { int bigramProbability = NOT_A_PROBABILITY; - if (mBloomFilter.isInFilter(nextWordPosition)) { - const std::unordered_map<int, int>::const_iterator bigramProbabilityIt = - mBigramMap.find(nextWordPosition); + if (mBloomFilter.isInFilter(nextWordId)) { + const auto bigramProbabilityIt = mBigramMap.find(nextWordId); if (bigramProbabilityIt != mBigramMap.end()) { bigramProbability = bigramProbabilityIt->second; } @@ -75,29 +73,24 @@ int MultiBigramMap::BigramMap::getBigramProbability( return structurePolicy->getProbability(unigramProbability, bigramProbability); } -void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, - const int targetPtNodePos) { - if (targetPtNodePos == NOT_A_DICT_POS) { +void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) { + if (targetWordId == NOT_A_WORD_ID) { return; } - mBigramMap[targetPtNodePos] = ngramProbability; - mBloomFilter.setInFilter(targetPtNodePos); + mBigramMap[targetWordId] = ngramProbability; + mBloomFilter.setInFilter(targetWordId); } -void MultiBigramMap::addBigramsForWordPosition( +void MultiBigramMap::addBigramsForWord( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos) { - if (prevWordsPtNodePos) { - mBigramMaps[prevWordsPtNodePos[0]].init(structurePolicy, prevWordsPtNodePos); - } + const WordIdArrayView prevWordIds) { + mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds); } int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos, const int nextWordPosition, - const int unigramProbability) { - const int bigramProbability = structurePolicy->getProbabilityOfPtNode(prevWordsPtNodePos, - nextWordPosition); + const WordIdArrayView prevWordIds, const 
int nextWordId, const int unigramProbability) { + const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId); if (bigramProbability != NOT_A_PROBABILITY) { return bigramProbability; } diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/dictionary/utils/multi_bigram_map.h index ad36dde83..6f23d98bc 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/dictionary/utils/multi_bigram_map.h @@ -21,10 +21,11 @@ #include <unordered_map> #include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/bloom_filter.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/bloom_filter.h" +#include "utils/int_array_view.h" namespace latinime { @@ -39,8 +40,7 @@ class MultiBigramMap { // Look up the bigram probability for the given word pair from the cached bigram maps. // Also caches the bigrams if there is space remaining and they have not been cached already. 
int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos, const int nextWordPosition, - const int unigramProbability); + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); void clear() { mBigramMaps.clear(); @@ -58,11 +58,11 @@ class MultiBigramMap { virtual ~BigramMap() {} void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos); + const WordIdArrayView prevWordIds); int getBigramProbability( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int nextWordPosition, const int unigramProbability) const; - virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos); + const int nextWordId, const int unigramProbability) const; + virtual void onVisitEntry(const int ngramProbability, const int targetWordId); private: static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; @@ -70,14 +70,12 @@ class MultiBigramMap { BloomFilter mBloomFilter; }; - void addBigramsForWordPosition( - const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos); + void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); int readBigramProbabilityFromBinaryDictionary( const DictionaryStructureWithBufferPolicy *const structurePolicy, - const int *const prevWordsPtNodePos, const int nextWordPosition, - const int unigramProbability); + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; std::unordered_map<int, BigramMap> mBigramMaps; diff --git a/native/jni/src/dictionary/utils/probability_utils.cpp b/native/jni/src/dictionary/utils/probability_utils.cpp new file mode 100644 index 000000000..426a0e783 --- /dev/null +++ b/native/jni/src/dictionary/utils/probability_utils.cpp @@ -0,0 
+1,23 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/probability_utils.h" + +namespace latinime { + +const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f; + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/dictionary/utils/probability_utils.h index 3b339e61a..2050af1e9 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h +++ b/native/jni/src/dictionary/utils/probability_utils.h @@ -17,6 +17,9 @@ #ifndef LATINIME_PROBABILITY_UTILS_H #define LATINIME_PROBABILITY_UTILS_H +#include <algorithm> +#include <cmath> + #include "defines.h" namespace latinime { @@ -47,8 +50,20 @@ class ProbabilityUtils { + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); } + // Encode probability using the same way as we are doing for main dictionaries. 
+ static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) { + const float probability = static_cast<float>(MAX_PROBABILITY) + + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER; + if (probability < 0.0f) { + return 0; + } + return std::min(static_cast<int>(probability + 0.5f), MAX_PROBABILITY); + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); + + static const float PROBABILITY_ENCODING_SCALER; }; } #endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp b/native/jni/src/dictionary/utils/sparse_table.cpp index d336306b9..029329fab 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.cpp +++ b/native/jni/src/dictionary/utils/sparse_table.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/utils/sparse_table.h" +#include "dictionary/utils/sparse_table.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h b/native/jni/src/dictionary/utils/sparse_table.h index fca8120f1..bd1190e8b 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/sparse_table.h +++ b/native/jni/src/dictionary/utils/sparse_table.h @@ -20,11 +20,10 @@ #include <cstdint> #include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { -// Note that there is a corresponding implementation in SparseTable.java. // TODO: Support multiple content buffers. class SparseTable { public: diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp b/native/jni/src/dictionary/utils/trie_map.cpp index 407b8efd0..0bef8c702 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.cpp +++ b/native/jni/src/dictionary/utils/trie_map.cpp @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "suggest/policyimpl/dictionary/utils/trie_map.h" +#include "dictionary/utils/trie_map.h" -#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "dictionary/utils/dict_file_writing_utils.h" namespace latinime { @@ -26,6 +26,7 @@ const int TrieMap::FIELD1_SIZE = 3; const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE; const uint32_t TrieMap::VALUE_FLAG = 0x400000; const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF; +const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK; const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000; const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF; const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5; @@ -34,6 +35,7 @@ const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_O const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0; const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE; const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0); +const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry. const uint64_t TrieMap::MAX_VALUE = (static_cast<uint64_t>(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1; const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE; @@ -76,14 +78,17 @@ int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIn return terminalEntry.getValueEntryIndex() + 1; } // Create a value entry and a bitmap entry. 
- const int valueEntryIndex = allocateTable(2 /* entryCount */); + const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return INVALID_INDEX; + } if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) { return INVALID_INDEX; } if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { return INVALID_INDEX; } - if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, valueEntryIndex)) { + if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) { return INVALID_INDEX; } return valueEntryIndex + 1; @@ -108,6 +113,31 @@ bool TrieMap::save(FILE *const file) const { return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer); } +bool TrieMap::remove(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast<uint32_t>(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. + return false; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) { + return false; + } + if (terminalEntry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1); + if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)){ + return false; + } + } + return true; +} + /** * Iterate next entry in a certain level. * @@ -129,7 +159,7 @@ const TrieMap::Result TrieMap::iterateNext(std::vector<TableIterationState> *con if (entry.isBitmapEntry()) { // Move to child. 
iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex()); - } else { + } else if (entry.isValidTerminalEntry()) { if (outKey) { *outKey = entry.getKey(); } @@ -162,12 +192,15 @@ uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const { } bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) { - if (value <= VALUE_MASK) { + if (value < VALUE_MASK) { // Write value into the terminal entry. return writeField1(value | VALUE_FLAG, terminalEntryIndex); } // Create value entry and write value. - const int valueEntryIndex = allocateTable(2 /* entryCount */); + const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return false; + } if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) { return false; } @@ -227,6 +260,9 @@ int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, // Move to the next level. return getTerminalEntryIndex(key, hashedKey, entry, level + 1); } + if (!entry.isValidTerminalEntry()) { + return INVALID_INDEX; + } if (entry.getKey() == key) { // Terminal entry is found. return entryIndex; @@ -287,6 +323,10 @@ bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32 // Bitmap entry is found. Go to the next level. return putInternal(key, value, hashedKey, entryIndex, entry, level + 1); } + if (!entry.isValidTerminalEntry()) { + // Overwrite invalid terminal entry. + return writeTerminalEntry(key, value, entryIndex); + } if (entry.getKey() == key) { // Terminal entry for the key is found. Update the value. return updateValue(entry, value, entryIndex); @@ -384,4 +424,37 @@ bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t val return true; } +bool TrieMap::removeInner(const Entry &bitmapEntry) { + const int tableSize = popCount(bitmapEntry.getBitmap()); + if (tableSize <= 0) { + // The table is empty. No need to remove any entries. 
+ return true; + } + for (int i = 0; i < tableSize; ++i) { + const int entryIndex = bitmapEntry.getTableIndex() + i; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Delete next bitmap entry recursively. + if (!removeInner(entry)) { + return false; + } + } else { + // Invalidate terminal entry just in case. + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) { + return false; + } + if (entry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1); + if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)) { + return false; + } + } + } + } + return true; +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h b/native/jni/src/dictionary/utils/trie_map.h index 3e5c4010c..5fc6c2690 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/trie_map.h +++ b/native/jni/src/dictionary/utils/trie_map.h @@ -23,7 +23,7 @@ #include <vector> #include "defines.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" #include "utils/byte_array_view.h" namespace latinime { @@ -84,6 +84,10 @@ class TrieMap { return mValue; } + AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const { + return mNextLevelBitmapEntryIndex; + } + private: const TrieMap *const mTrieMap; const int mKey; @@ -94,7 +98,7 @@ class TrieMap { TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex) : mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex), mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) { - if (!trieMap) { + if (!trieMap || mBaseBitmapEntryIndex == INVALID_INDEX) { return; } const Entry bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex); @@ -202,6 +206,8 @@ class TrieMap { bool save(FILE *const file) 
const; + bool remove(const int key, const int bitmapEntryIndex); + private: DISALLOW_COPY_AND_ASSIGN(TrieMap); @@ -245,6 +251,11 @@ class TrieMap { } // For terminal entry. + AK_FORCE_INLINE bool isValidTerminalEntry() const { + return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY); + } + + // For terminal entry. AK_FORCE_INLINE uint32_t getValueEntryIndex() const { return mData1 & TERMINAL_LINK_MASK; } @@ -272,6 +283,7 @@ class TrieMap { static const int ENTRY_SIZE; static const uint32_t VALUE_FLAG; static const uint32_t VALUE_MASK; + static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY; static const uint32_t TERMINAL_LINK_FLAG; static const uint32_t TERMINAL_LINK_MASK; static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL; @@ -280,6 +292,7 @@ class TrieMap { static const int ROOT_BITMAP_ENTRY_INDEX; static const int ROOT_BITMAP_ENTRY_POS; static const Entry EMPTY_BITMAP_ENTRY; + static const int TERMINAL_LINKED_ENTRY_COUNT; static const int MAX_BUFFER_SIZE; uint32_t getBitShuffledKey(const uint32_t key) const; @@ -378,6 +391,8 @@ class TrieMap { AK_FORCE_INLINE int getTailEntryIndex() const { return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE; } + + bool removeInner(const Entry &bitmapEntry); }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h index d1b2c87be..5214077dc 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node.h +++ b/native/jni/src/suggest/core/dicnode/dic_node.h @@ -26,6 +26,7 @@ #include "suggest/core/dictionary/error_type_utils.h" #include "suggest/core/layout/proximity_info_state.h" #include "utils/char_utils.h" +#include "utils/int_array_view.h" #if DEBUG_DICT #define LOGI_SHOW_ADD_COST_PROP \ @@ -103,10 +104,10 @@ class DicNode { PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } - // Init for root with prevWordsPtNodePos which is used for n-gram - void initAsRoot(const int rootPtNodeArrayPos, const int 
*const prevWordsPtNodePos) { + // Init for root with prevWordIds which is used for n-gram + void initAsRoot(const int rootPtNodeArrayPos, const WordIdArrayView prevWordIds) { mIsCachedForNextSuggestion = false; - mDicNodeProperties.init(rootPtNodeArrayPos, prevWordsPtNodePos); + mDicNodeProperties.init(rootPtNodeArrayPos, prevWordIds); mDicNodeState.init(); PROF_NODE_RESET(mProfiler); } @@ -114,12 +115,11 @@ class DicNode { // Init for root with previous word void initAsRootWithPreviousWord(const DicNode *const dicNode, const int rootPtNodeArrayPos) { mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; - int newPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - newPrevWordsPtNodePos[0] = dicNode->mDicNodeProperties.getPtNodePos(); - for (size_t i = 1; i < NELEMS(newPrevWordsPtNodePos); ++i) { - newPrevWordsPtNodePos[i] = dicNode->getPrevWordsTerminalPtNodePos()[i - 1]; - } - mDicNodeProperties.init(rootPtNodeArrayPos, newPrevWordsPtNodePos); + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> newPrevWordIds; + newPrevWordIds[0] = dicNode->mDicNodeProperties.getWordId(); + dicNode->getPrevWordIds().limit(newPrevWordIds.size() - 1) + .copyToArray(&newPrevWordIds, 1 /* offset */); + mDicNodeProperties.init(rootPtNodeArrayPos, WordIdArrayView::fromArray(newPrevWordIds)); mDicNodeState.initAsRootWithPreviousWord(&dicNode->mDicNodeState, dicNode->mDicNodeProperties.getDepth()); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); @@ -135,19 +135,16 @@ class DicNode { PROF_NODE_COPY(&parentDicNode->mProfiler, mProfiler); } - void initAsChild(const DicNode *const dicNode, const int ptNodePos, - const int childrenPtNodeArrayPos, const int probability, const bool isTerminal, - const bool hasChildren, const bool isBlacklistedOrNotAWord, - const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { + void initAsChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos, + const int wordId, const CodePointArrayView mergedCodePoints) { 
uint16_t newDepth = static_cast<uint16_t>(dicNode->getNodeCodePointCount() + 1); mIsCachedForNextSuggestion = dicNode->mIsCachedForNextSuggestion; const uint16_t newLeavingDepth = static_cast<uint16_t>( - dicNode->mDicNodeProperties.getLeavingDepth() + mergedNodeCodePointCount); - mDicNodeProperties.init(ptNodePos, childrenPtNodeArrayPos, mergedNodeCodePoints[0], - probability, isTerminal, hasChildren, isBlacklistedOrNotAWord, newDepth, - newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordsTerminalPtNodePos()); - mDicNodeState.init(&dicNode->mDicNodeState, mergedNodeCodePointCount, - mergedNodeCodePoints); + dicNode->mDicNodeProperties.getLeavingDepth() + mergedCodePoints.size()); + mDicNodeProperties.init(childrenPtNodeArrayPos, mergedCodePoints[0], + wordId, newDepth, newLeavingDepth, dicNode->mDicNodeProperties.getPrevWordIds()); + mDicNodeState.init(&dicNode->mDicNodeState, mergedCodePoints.size(), + mergedCodePoints.data()); PROF_NODE_COPY(&dicNode->mProfiler, mProfiler); } @@ -179,9 +176,6 @@ class DicNode { // Check if the current word and the previous word can be considered as a valid multiple word // suggestion. bool isValidMultipleWordSuggestion() const { - if (isBlacklistedOrNotAWord()) { - return false; - } // Treat suggestion as invalid if the current and the previous word are single character // words. const int prevWordLen = mDicNodeState.mDicNodeStateOutput.getPrevWordsLength() @@ -204,13 +198,12 @@ class DicNode { } // Used to get n-gram probability in DicNodeUtils. - int getPtNodePos() const { - return mDicNodeProperties.getPtNodePos(); + int getWordId() const { + return mDicNodeProperties.getWordId(); } - // TODO: Use view class to return PtNodePos array. 
- const int *getPrevWordsTerminalPtNodePos() const { - return mDicNodeProperties.getPrevWordsTerminalPtNodePos(); + const WordIdArrayView getPrevWordIds() const { + return mDicNodeProperties.getPrevWordIds(); } // Used in DicNodeUtils @@ -218,10 +211,6 @@ class DicNode { return mDicNodeProperties.getChildrenPtNodeArrayPos(); } - int getProbability() const { - return mDicNodeProperties.getProbability(); - } - AK_FORCE_INLINE bool isTerminalDicNode() const { const bool isTerminalPtNode = mDicNodeProperties.isTerminal(); const int currentDicNodeDepth = getNodeCodePointCount(); @@ -306,8 +295,9 @@ class DicNode { } // Used to prune nodes - float getCompoundDistance(const float languageWeight) const { - return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(languageWeight); + float getCompoundDistance(const float weightOfLangModelVsSpatialModel) const { + return mDicNodeState.mDicNodeStateScoring.getCompoundDistance( + weightOfLangModelVsSpatialModel); } AK_FORCE_INLINE const int *getOutputWordBuf() const { @@ -404,10 +394,6 @@ class DicNode { return mDicNodeState.mDicNodeStateScoring.getContainedErrorTypes(); } - bool isBlacklistedOrNotAWord() const { - return mDicNodeProperties.isBlacklistedOrNotAWord(); - } - inline uint16_t getNodeCodePointCount() const { return mDicNodeProperties.getDepth(); } diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 69ea67418..a20252cd2 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -16,10 +16,9 @@ #include "suggest/core/dicnode/dic_node_utils.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/multi_bigram_map.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" namespace latinime { @@ -29,8 
+28,8 @@ namespace latinime { /* static */ void DicNodeUtils::initAsRoot( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int *const prevWordsPtNodePos, DicNode *const newRootDicNode) { - newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordsPtNodePos); + const WordIdArrayView prevWordIds, DicNode *const newRootDicNode) { + newRootDicNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordIds); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord( @@ -73,25 +72,17 @@ namespace latinime { if (dicNode->hasMultipleWords() && !dicNode->isValidMultipleWordSuggestion()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } - const int probability = getBigramNodeProbability(dictionaryStructurePolicy, dicNode, - multiBigramMap); + const WordAttributes wordAttributes = dictionaryStructurePolicy->getWordAttributesInContext( + dicNode->getPrevWordIds(), dicNode->getWordId(), multiBigramMap); + if (wordAttributes.getProbability() == NOT_A_PROBABILITY + || (dicNode->hasMultipleWords() + && (wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()))) { + return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + } // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. 
- const float cost = static_cast<float>(MAX_PROBABILITY - probability) + const float cost = static_cast<float>(MAX_PROBABILITY - wordAttributes.getProbability()) / static_cast<float>(MAX_PROBABILITY); return cost; } -/* static */ int DicNodeUtils::getBigramNodeProbability( - const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) { - const int unigramProbability = dicNode->getProbability(); - if (multiBigramMap) { - const int *const prevWordsPtNodePos = dicNode->getPrevWordsTerminalPtNodePos(); - return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, - prevWordsPtNodePos, dicNode->getPtNodePos(), unigramProbability); - } - return dictionaryStructurePolicy->getProbability(unigramProbability, - NOT_A_PROBABILITY); -} - } // namespace latinime diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index 00e80c604..b891a842a 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -18,6 +18,7 @@ #define LATINIME_DIC_NODE_UTILS_H #include "defines.h" +#include "utils/int_array_view.h" namespace latinime { @@ -30,7 +31,7 @@ class DicNodeUtils { public: static void initAsRoot( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int *const prevWordPtNodePos, DicNode *const newRootDicNode); + const WordIdArrayView prevWordIds, DicNode *const newRootDicNode); static void initAsRootWithPreviousWord( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DicNode *const prevWordLastDicNode, DicNode *const newRootDicNode); @@ -46,10 +47,6 @@ class DicNodeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils); // Max number of bigrams to look up static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500; - - static int getBigramNodeProbability( - const 
DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const DicNode *const dicNode, MultiBigramMap *const multiBigramMap); }; } // namespace latinime #endif // LATINIME_DIC_NODE_UTILS_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_vector.h b/native/jni/src/suggest/core/dicnode/dic_node_vector.h index 54cde1988..e6b758954 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_vector.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_vector.h @@ -21,6 +21,7 @@ #include "defines.h" #include "suggest/core/dicnode/dic_node.h" +#include "utils/int_array_view.h" namespace latinime { @@ -58,15 +59,11 @@ class DicNodeVector { mDicNodes.back().initAsPassingChild(dicNode); } - void pushLeavingChild(const DicNode *const dicNode, const int ptNodePos, - const int childrenPtNodeArrayPos, const int probability, const bool isTerminal, - const bool hasChildren, const bool isBlacklistedOrNotAWord, - const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { + void pushLeavingChild(const DicNode *const dicNode, const int childrenPtNodeArrayPos, + const int wordId, const CodePointArrayView mergedCodePoints) { ASSERT(!mLock); mDicNodes.emplace_back(); - mDicNodes.back().initAsChild(dicNode, ptNodePos, childrenPtNodeArrayPos, probability, - isTerminal, hasChildren, isBlacklistedOrNotAWord, mergedNodeCodePointCount, - mergedNodeCodePoints); + mDicNodes.back().initAsChild(dicNode, childrenPtNodeArrayPos, wordId, mergedCodePoints); } DicNode *operator[](const int id) { diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h index 8202176f7..1b796b5d4 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_properties.h @@ -18,8 +18,10 @@ #define LATINIME_DIC_NODE_PROPERTIES_H #include <cstdint> +#include <cstdlib> #include "defines.h" +#include 
"utils/int_array_view.h" namespace latinime { @@ -29,84 +31,61 @@ namespace latinime { class DicNodeProperties { public: AK_FORCE_INLINE DicNodeProperties() - : mPtNodePos(NOT_A_DICT_POS), mChildrenPtNodeArrayPos(NOT_A_DICT_POS), - mProbability(NOT_A_PROBABILITY), mDicNodeCodePoint(NOT_A_CODE_POINT), - mIsTerminal(false), mHasChildrenPtNodes(false), - mIsBlacklistedOrNotAWord(false), mDepth(0), mLeavingDepth(0) {} + : mChildrenPtNodeArrayPos(NOT_A_DICT_POS), mDicNodeCodePoint(NOT_A_CODE_POINT), + mWordId(NOT_A_WORD_ID), mDepth(0), mLeavingDepth(0), mPrevWordCount(0) {} ~DicNodeProperties() {} // Should be called only once per DicNode is initialized. - void init(const int pos, const int childrenPos, const int nodeCodePoint, const int probability, - const bool isTerminal, const bool hasChildren, const bool isBlacklistedOrNotAWord, - const uint16_t depth, const uint16_t leavingDepth, const int *const prevWordsNodePos) { - mPtNodePos = pos; + void init(const int childrenPos, const int nodeCodePoint, const int wordId, + const uint16_t depth, const uint16_t leavingDepth, const WordIdArrayView prevWordIds) { mChildrenPtNodeArrayPos = childrenPos; mDicNodeCodePoint = nodeCodePoint; - mProbability = probability; - mIsTerminal = isTerminal; - mHasChildrenPtNodes = hasChildren; - mIsBlacklistedOrNotAWord = isBlacklistedOrNotAWord; + mWordId = wordId; mDepth = depth; mLeavingDepth = leavingDepth; - memmove(mPrevWordsTerminalPtNodePos, prevWordsNodePos, sizeof(mPrevWordsTerminalPtNodePos)); + prevWordIds.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIds.size(); } // Init for root with prevWordsPtNodePos which is used for n-gram - void init(const int rootPtNodeArrayPos, const int *const prevWordsNodePos) { - mPtNodePos = NOT_A_DICT_POS; + void init(const int rootPtNodeArrayPos, const WordIdArrayView prevWordIds) { mChildrenPtNodeArrayPos = rootPtNodeArrayPos; mDicNodeCodePoint = NOT_A_CODE_POINT; - mProbability = NOT_A_PROBABILITY; - mIsTerminal = false; 
- mHasChildrenPtNodes = true; - mIsBlacklistedOrNotAWord = false; + mWordId = NOT_A_WORD_ID; mDepth = 0; mLeavingDepth = 0; - memmove(mPrevWordsTerminalPtNodePos, prevWordsNodePos, sizeof(mPrevWordsTerminalPtNodePos)); + prevWordIds.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIds.size(); } void initByCopy(const DicNodeProperties *const dicNodeProp) { - mPtNodePos = dicNodeProp->mPtNodePos; mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; mDicNodeCodePoint = dicNodeProp->mDicNodeCodePoint; - mProbability = dicNodeProp->mProbability; - mIsTerminal = dicNodeProp->mIsTerminal; - mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes; - mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; + mWordId = dicNodeProp->mWordId; mDepth = dicNodeProp->mDepth; mLeavingDepth = dicNodeProp->mLeavingDepth; - memmove(mPrevWordsTerminalPtNodePos, dicNodeProp->mPrevWordsTerminalPtNodePos, - sizeof(mPrevWordsTerminalPtNodePos)); + const WordIdArrayView prevWordIdArrayView = dicNodeProp->getPrevWordIds(); + prevWordIdArrayView.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIdArrayView.size(); } // Init as passing child void init(const DicNodeProperties *const dicNodeProp, const int codePoint) { - mPtNodePos = dicNodeProp->mPtNodePos; mChildrenPtNodeArrayPos = dicNodeProp->mChildrenPtNodeArrayPos; mDicNodeCodePoint = codePoint; // Overwrite the node char of a passing child - mProbability = dicNodeProp->mProbability; - mIsTerminal = dicNodeProp->mIsTerminal; - mHasChildrenPtNodes = dicNodeProp->mHasChildrenPtNodes; - mIsBlacklistedOrNotAWord = dicNodeProp->mIsBlacklistedOrNotAWord; + mWordId = dicNodeProp->mWordId; mDepth = dicNodeProp->mDepth + 1; // Increment the depth of a passing child mLeavingDepth = dicNodeProp->mLeavingDepth; - memmove(mPrevWordsTerminalPtNodePos, dicNodeProp->mPrevWordsTerminalPtNodePos, - sizeof(mPrevWordsTerminalPtNodePos)); - } - - int getPtNodePos() const { - return mPtNodePos; + 
const WordIdArrayView prevWordIdArrayView = dicNodeProp->getPrevWordIds(); + prevWordIdArrayView.copyToArray(&mPrevWordIds, 0 /* offset */); + mPrevWordCount = prevWordIdArrayView.size(); } int getChildrenPtNodeArrayPos() const { return mChildrenPtNodeArrayPos; } - int getProbability() const { - return mProbability; - } - int getDicNodeCodePoint() const { return mDicNodeCodePoint; } @@ -121,35 +100,32 @@ class DicNodeProperties { } bool isTerminal() const { - return mIsTerminal; + return mWordId != NOT_A_WORD_ID; } bool hasChildren() const { - return mHasChildrenPtNodes || mDepth != mLeavingDepth; + return (mChildrenPtNodeArrayPos != NOT_A_DICT_POS) || mDepth != mLeavingDepth; } - bool isBlacklistedOrNotAWord() const { - return mIsBlacklistedOrNotAWord; + const WordIdArrayView getPrevWordIds() const { + return WordIdArrayView::fromArray(mPrevWordIds).limit(mPrevWordCount); } - const int *getPrevWordsTerminalPtNodePos() const { - return mPrevWordsTerminalPtNodePos; + int getWordId() const { + return mWordId; } private: // Caution!!! 
// Use a default copy constructor and an assign operator because shallow copies are ok // for this class - int mPtNodePos; int mChildrenPtNodeArrayPos; - int mProbability; int mDicNodeCodePoint; - bool mIsTerminal; - bool mHasChildrenPtNodes; - bool mIsBlacklistedOrNotAWord; + int mWordId; uint16_t mDepth; uint16_t mLeavingDepth; - int mPrevWordsTerminalPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> mPrevWordIds; + size_t mPrevWordCount; }; } // namespace latinime #endif // LATINIME_DIC_NODE_PROPERTIES_H diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h index c19d48eb9..3a54c2599 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_scoring.h @@ -103,8 +103,10 @@ class DicNodeStateScoring { return getCompoundDistance(1.0f); } - float getCompoundDistance(const float languageWeight) const { - return mSpatialDistance + mLanguageDistance * languageWeight; + float getCompoundDistance( + const float weightOfLangModelVsSpatialModel) const { + return mSpatialDistance + + mLanguageDistance * weightOfLangModelVsSpatialModel; } float getNormalizedCompoundDistance() const { diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index d62573970..5c9a1392e 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -19,15 +19,16 @@ #include "suggest/core/dictionary/dictionary.h" #include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/property/ngram_context.h" #include "suggest/core/dictionary/dictionary_utils.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/result/suggestion_results.h" #include 
"suggest/core/session/dic_traverse_session.h" -#include "suggest/core/session/prev_words_info.h" #include "suggest/core/suggest.h" #include "suggest/core/suggest_options.h" #include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h" #include "suggest/policyimpl/typing/typing_suggest_policy_factory.h" +#include "utils/int_array_view.h" #include "utils/log_utils.h" #include "utils/time_keeper.h" @@ -45,88 +46,87 @@ Dictionary::Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::Structu void Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, - int inputSize, const PrevWordsInfo *const prevWordsInfo, - const SuggestOptions *const suggestOptions, const float languageWeight, + int inputSize, const NgramContext *const ngramContext, + const SuggestOptions *const suggestOptions, const float weightOfLangModelVsSpatialModel, SuggestionResults *const outSuggestionResults) const { TimeKeeper::setCurrentTime(); - traverseSession->init(this, prevWordsInfo, suggestOptions); + traverseSession->init(this, ngramContext, suggestOptions); const auto &suggest = suggestOptions->isGesture() ? 
mGestureSuggest : mTypingSuggest; suggest->getSuggestions(proximityInfo, traverseSession, xcoordinates, ycoordinates, times, pointerIds, inputCodePoints, inputSize, - languageWeight, outSuggestionResults); - if (DEBUG_DICT) { - outSuggestionResults->dumpSuggestions(); - } + weightOfLangModelVsSpatialModel, outSuggestionResults); } Dictionary::NgramListenerForPrediction::NgramListenerForPrediction( - const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const suggestionResults, + const NgramContext *const ngramContext, const WordIdArrayView prevWordIds, + SuggestionResults *const suggestionResults, const DictionaryStructureWithBufferPolicy *const dictStructurePolicy) - : mPrevWordsInfo(prevWordsInfo), mSuggestionResults(suggestionResults), - mDictStructurePolicy(dictStructurePolicy) {} + : mNgramContext(ngramContext), mPrevWordIds(prevWordIds), + mSuggestionResults(suggestionResults), mDictStructurePolicy(dictStructurePolicy) {} void Dictionary::NgramListenerForPrediction::onVisitEntry(const int ngramProbability, - const int targetPtNodePos) { - if (targetPtNodePos == NOT_A_DICT_POS) { + const int targetWordId) { + if (targetWordId == NOT_A_WORD_ID) { return; } - if (mPrevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */) + if (mNgramContext->isNthPrevWordBeginningOfSentence(1 /* n */) && ngramProbability == NOT_A_PROBABILITY) { return; } int targetWordCodePoints[MAX_WORD_LENGTH]; - int unigramProbability = 0; - const int codePointCount = mDictStructurePolicy-> - getCodePointsAndProbabilityAndReturnCodePointCount(targetPtNodePos, - MAX_WORD_LENGTH, targetWordCodePoints, &unigramProbability); + const int codePointCount = mDictStructurePolicy->getCodePointsAndReturnCodePointCount( + targetWordId, MAX_WORD_LENGTH, targetWordCodePoints); if (codePointCount <= 0) { return; } - const int probability = mDictStructurePolicy->getProbability( - unigramProbability, ngramProbability); - mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, 
probability); + const WordAttributes wordAttributes = mDictStructurePolicy->getWordAttributesInContext( + mPrevWordIds, targetWordId, nullptr /* multiBigramMap */); + if (wordAttributes.getProbability() == NOT_A_PROBABILITY) { + return; + } + mSuggestionResults->addPrediction(targetWordCodePoints, codePointCount, + wordAttributes.getProbability()); } -void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, +void Dictionary::getPredictions(const NgramContext *const ngramContext, SuggestionResults *const outSuggestionResults) const { TimeKeeper::setCurrentTime(); - NgramListenerForPrediction listener(prevWordsInfo, outSuggestionResults, - mDictionaryStructureWithBufferPolicy.get()); - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - prevWordsInfo->getPrevWordsTerminalPtNodePos( - mDictionaryStructureWithBufferPolicy.get(), prevWordsPtNodePos, + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds( + mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray, true /* tryLowerCaseSearch */); - mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordsPtNodePos, &listener); + NgramListenerForPrediction listener(ngramContext, prevWordIds, outSuggestionResults, + mDictionaryStructureWithBufferPolicy.get()); + mDictionaryStructureWithBufferPolicy->iterateNgramEntries(prevWordIds, &listener); } -int Dictionary::getProbability(const int *word, int length) const { - return getNgramProbability(nullptr /* prevWordsInfo */, word, length); +int Dictionary::getProbability(const CodePointArrayView codePoints) const { + return getNgramProbability(nullptr /* ngramContext */, codePoints); } -int Dictionary::getMaxProbabilityOfExactMatches(const int *word, int length) const { +int Dictionary::getMaxProbabilityOfExactMatches(const CodePointArrayView codePoints) const { TimeKeeper::setCurrentTime(); return DictionaryUtils::getMaxProbabilityOfExactMatches( - 
mDictionaryStructureWithBufferPolicy.get(), word, length); + mDictionaryStructureWithBufferPolicy.get(), codePoints); } -int Dictionary::getNgramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, - int length) const { +int Dictionary::getNgramProbability(const NgramContext *const ngramContext, + const CodePointArrayView codePoints) const { TimeKeeper::setCurrentTime(); - int nextWordPos = mDictionaryStructureWithBufferPolicy->getTerminalPtNodePositionOfWord(word, - length, false /* forceLowerCaseSearch */); - if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; - if (!prevWordsInfo) { - return getDictionaryStructurePolicy()->getProbabilityOfPtNode( - nullptr /* prevWordsPtNodePos */, nextWordPos); + const int wordId = mDictionaryStructureWithBufferPolicy->getWordId(codePoints, + false /* forceLowerCaseSearch */); + if (wordId == NOT_A_WORD_ID) return NOT_A_PROBABILITY; + if (!ngramContext) { + return getDictionaryStructurePolicy()->getProbabilityOfWord(WordIdArrayView(), wordId); } - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - prevWordsInfo->getPrevWordsTerminalPtNodePos( - mDictionaryStructureWithBufferPolicy.get(), prevWordsPtNodePos, + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = ngramContext->getPrevWordIds( + mDictionaryStructureWithBufferPolicy.get(), &prevWordIdArray, true /* tryLowerCaseSearch */); - return getDictionaryStructurePolicy()->getProbabilityOfPtNode(prevWordsPtNodePos, nextWordPos); + return getDictionaryStructurePolicy()->getProbabilityOfWord(prevWordIds, wordId); } -bool Dictionary::addUnigramEntry(const int *const word, const int length, +bool Dictionary::addUnigramEntry(const CodePointArrayView codePoints, const UnigramProperty *const unigramProperty) { if (unigramProperty->representsBeginningOfSentence() && !mDictionaryStructureWithBufferPolicy->getHeaderStructurePolicy() @@ -135,24 +135,31 @@ bool Dictionary::addUnigramEntry(const int *const 
word, const int length, return false; } TimeKeeper::setCurrentTime(); - return mDictionaryStructureWithBufferPolicy->addUnigramEntry(word, length, unigramProperty); + return mDictionaryStructureWithBufferPolicy->addUnigramEntry(codePoints, unigramProperty); +} + +bool Dictionary::removeUnigramEntry(const CodePointArrayView codePoints) { + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints); } -bool Dictionary::removeUnigramEntry(const int *const codePoints, const int codePointCount) { +bool Dictionary::addNgramEntry(const NgramProperty *const ngramProperty) { TimeKeeper::setCurrentTime(); - return mDictionaryStructureWithBufferPolicy->removeUnigramEntry(codePoints, codePointCount); + return mDictionaryStructureWithBufferPolicy->addNgramEntry(ngramProperty); } -bool Dictionary::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty) { +bool Dictionary::removeNgramEntry(const NgramContext *const ngramContext, + const CodePointArrayView codePoints) { TimeKeeper::setCurrentTime(); - return mDictionaryStructureWithBufferPolicy->addNgramEntry(prevWordsInfo, bigramProperty); + return mDictionaryStructureWithBufferPolicy->removeNgramEntry(ngramContext, codePoints); } -bool Dictionary::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const int *const word, const int length) { +bool Dictionary::updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView codePoints, const bool isValidWord, + const HistoricalInfo historicalInfo) { TimeKeeper::setCurrentTime(); - return mDictionaryStructureWithBufferPolicy->removeNgramEntry(prevWordsInfo, word, length); + return mDictionaryStructureWithBufferPolicy->updateEntriesForWordWithNgramContext(ngramContext, + codePoints, isValidWord, historicalInfo); } bool Dictionary::flush(const char *const filePath) { @@ -177,11 +184,9 @@ void Dictionary::getProperty(const char *const query, 
const int queryLength, cha maxResultLength); } -const WordProperty Dictionary::getWordProperty(const int *const codePoints, - const int codePointCount) { +const WordProperty Dictionary::getWordProperty(const CodePointArrayView codePoints) { TimeKeeper::setCurrentTime(); - return mDictionaryStructureWithBufferPolicy->getWordProperty( - codePoints, codePointCount); + return mDictionaryStructureWithBufferPolicy->getWordProperty(codePoints); } int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints, diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 732d3b199..9e224ebfb 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -21,17 +21,19 @@ #include "defines.h" #include "jni.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/property/word_property.h" #include "suggest/core/suggest_interface.h" +#include "utils/int_array_view.h" namespace latinime { class DictionaryStructureWithBufferPolicy; class DicTraverseSession; -class PrevWordsInfo; +class NgramContext; class ProximityInfo; class SuggestionResults; class SuggestOptions; @@ -58,36 +60,40 @@ class Dictionary { static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; static const int KIND_FLAG_EXACT_MATCH = 0x40000000; static const int KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = 0x20000000; + static const int KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION = 0x10000000; 
Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy); void getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, - int inputSize, const PrevWordsInfo *const prevWordsInfo, - const SuggestOptions *const suggestOptions, const float languageWeight, + int inputSize, const NgramContext *const ngramContext, + const SuggestOptions *const suggestOptions, const float weightOfLangModelVsSpatialModel, SuggestionResults *const outSuggestionResults) const; - void getPredictions(const PrevWordsInfo *const prevWordsInfo, + void getPredictions(const NgramContext *const ngramContext, SuggestionResults *const outSuggestionResults) const; - int getProbability(const int *word, int length) const; + int getProbability(const CodePointArrayView codePoints) const; - int getMaxProbabilityOfExactMatches(const int *word, int length) const; + int getMaxProbabilityOfExactMatches(const CodePointArrayView codePoints) const; - int getNgramProbability(const PrevWordsInfo *const prevWordsInfo, - const int *word, int length) const; + int getNgramProbability(const NgramContext *const ngramContext, + const CodePointArrayView codePoints) const; - bool addUnigramEntry(const int *const codePoints, const int codePointCount, + bool addUnigramEntry(const CodePointArrayView codePoints, const UnigramProperty *const unigramProperty); - bool removeUnigramEntry(const int *const codePoints, const int codePointCount); + bool removeUnigramEntry(const CodePointArrayView codePoints); - bool addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty); + bool addNgramEntry(const NgramProperty *const ngramProperty); - bool removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, const int *const word, - const int length); + bool removeNgramEntry(const NgramContext *const ngramContext, + const 
CodePointArrayView codePoints); + + bool updateEntriesForWordWithNgramContext(const NgramContext *const ngramContext, + const CodePointArrayView codePoints, const bool isValidWord, + const HistoricalInfo historicalInfo); bool flush(const char *const filePath); @@ -98,7 +104,7 @@ class Dictionary { void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength); - const WordProperty getWordProperty(const int *const codePoints, const int codePointCount); + const WordProperty getWordProperty(const CodePointArrayView codePoints); // Method to iterate all words in the dictionary. // The returned token has to be used to get the next word. If token is 0, this method newly @@ -117,15 +123,16 @@ class Dictionary { class NgramListenerForPrediction : public NgramListener { public: - NgramListenerForPrediction(const PrevWordsInfo *const prevWordsInfo, - SuggestionResults *const suggestionResults, + NgramListenerForPrediction(const NgramContext *const ngramContext, + const WordIdArrayView prevWordIds, SuggestionResults *const suggestionResults, const DictionaryStructureWithBufferPolicy *const dictStructurePolicy); - virtual void onVisitEntry(const int ngramProbability, const int targetPtNodePos); + virtual void onVisitEntry(const int ngramProbability, const int targetWordId); private: DISALLOW_IMPLICIT_CONSTRUCTORS(NgramListenerForPrediction); - const PrevWordsInfo *const mPrevWordsInfo; + const NgramContext *const mNgramContext; + const WordIdArrayView mPrevWordIds; SuggestionResults *const mSuggestionResults; const DictionaryStructureWithBufferPolicy *const mDictStructurePolicy; }; diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp index b94966cbe..7de550026 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.cpp @@ -16,39 +16,40 @@ #include 
"suggest/core/dictionary/dictionary_utils.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/ngram_context.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" -#include "suggest/core/session/prev_words_info.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "utils/int_array_view.h" namespace latinime { /* static */ int DictionaryUtils::getMaxProbabilityOfExactMatches( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int *const codePoints, const int codePointCount) { + const CodePointArrayView codePoints) { std::vector<DicNode> current; std::vector<DicNode> next; - // No prev words information. - PrevWordsInfo emptyPrevWordsInfo; - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - emptyPrevWordsInfo.getPrevWordsTerminalPtNodePos(dictionaryStructurePolicy, - prevWordsPtNodePos, false /* tryLowerCaseSearch */); + // No ngram context. + NgramContext emptyNgramContext; + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> prevWordIdArray; + const WordIdArrayView prevWordIds = emptyNgramContext.getPrevWordIds( + dictionaryStructurePolicy, &prevWordIdArray, false /* tryLowerCaseSearch */); current.emplace_back(); - DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordsPtNodePos, &current.front()); - for (int i = 0; i < codePointCount; ++i) { + DicNodeUtils::initAsRoot(dictionaryStructurePolicy, prevWordIds, &current.front()); + for (const int codePoint : codePoints) { // The base-lower input is used to ignore case errors and accent errors. 
- const int codePoint = CharUtils::toBaseLowerCase(codePoints[i]); + const int baseLowerCodePoint = CharUtils::toBaseLowerCase(codePoint); for (const DicNode &dicNode : current) { - if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == codePoint) { + if (dicNode.isInDigraph() && dicNode.getNodeCodePoint() == baseLowerCodePoint) { next.emplace_back(dicNode); next.back().advanceDigraphIndex(); continue; } - processChildDicNodes(dictionaryStructurePolicy, codePoint, &dicNode, &next); + processChildDicNodes(dictionaryStructurePolicy, baseLowerCodePoint, &dicNode, &next); } current.clear(); current.swap(next); @@ -59,8 +60,11 @@ namespace latinime { if (!dicNode.isTerminalDicNode()) { continue; } + const WordAttributes wordAttributes = + dictionaryStructurePolicy->getWordAttributesInContext(dicNode.getPrevWordIds(), + dicNode.getWordId(), nullptr /* multiBigramMap */); // dicNode can contain case errors, accent errors, intentional omissions or digraphs. - maxProbability = std::max(maxProbability, dicNode.getProbability()); + maxProbability = std::max(maxProbability, wordAttributes.getProbability()); } return maxProbability; } diff --git a/native/jni/src/suggest/core/dictionary/dictionary_utils.h b/native/jni/src/suggest/core/dictionary/dictionary_utils.h index 358ebf674..4dd21c9be 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary_utils.h +++ b/native/jni/src/suggest/core/dictionary/dictionary_utils.h @@ -20,6 +20,7 @@ #include <vector> #include "defines.h" +#include "utils/int_array_view.h" namespace latinime { @@ -30,7 +31,7 @@ class DictionaryUtils { public: static int getMaxProbabilityOfExactMatches( const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, - const int *const codePoints, const int codePointCount); + const CodePointArrayView codePoints); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryUtils); diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp 
b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp index bb2ce5012..4d68f620f 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp @@ -19,7 +19,7 @@ #include <cstdlib> #include "defines.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" #include "utils/char_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp index b6bf7a98c..61093e174 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.cpp @@ -19,17 +19,20 @@ namespace latinime { const ErrorTypeUtils::ErrorType ErrorTypeUtils::NOT_AN_ERROR = 0x0; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_CASE_ERROR = 0x1; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR = 0x2; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x4; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x8; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::EDIT_CORRECTION = 0x10; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x20; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x40; -const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x80; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_CASE = 0x1; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT = 0x2; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT = 0x4; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT = 0x8; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::MATCH_WITH_DIGRAPH = 0x10; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::INTENTIONAL_OMISSION = 0x20; +const ErrorTypeUtils::ErrorType 
ErrorTypeUtils::EDIT_CORRECTION = 0x40; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::PROXIMITY_CORRECTION = 0x80; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::COMPLETION = 0x100; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::NEW_WORD = 0x200; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH = - NOT_AN_ERROR | MATCH_WITH_CASE_ERROR | MATCH_WITH_ACCENT_ERROR | MATCH_WITH_DIGRAPH; + NOT_AN_ERROR | MATCH_WITH_WRONG_CASE | MATCH_WITH_MISSING_ACCENT | MATCH_WITH_DIGRAPH; +const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_A_PERFECT_MATCH = NOT_AN_ERROR; const ErrorTypeUtils::ErrorType ErrorTypeUtils::ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION = diff --git a/native/jni/src/suggest/core/dictionary/error_type_utils.h b/native/jni/src/suggest/core/dictionary/error_type_utils.h index e3e76b238..75111ba75 100644 --- a/native/jni/src/suggest/core/dictionary/error_type_utils.h +++ b/native/jni/src/suggest/core/dictionary/error_type_utils.h @@ -30,8 +30,10 @@ class ErrorTypeUtils { typedef uint32_t ErrorType; static const ErrorType NOT_AN_ERROR; - static const ErrorType MATCH_WITH_CASE_ERROR; - static const ErrorType MATCH_WITH_ACCENT_ERROR; + static const ErrorType MATCH_WITH_WRONG_CASE; + static const ErrorType MATCH_WITH_MISSING_ACCENT; + static const ErrorType MATCH_WITH_MISSING_EXPLICIT_ACCENT; + static const ErrorType MATCH_WITH_WRONG_ACCENT; static const ErrorType MATCH_WITH_DIGRAPH; // Treat error as an intentional omission when the CorrectionType is omission and the node can // be intentional omission. 
@@ -51,11 +53,19 @@ class ErrorTypeUtils { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH) == 0; } + static bool isPerfectMatch(const ErrorType containedErrorTypes) { + return (containedErrorTypes & ~ERRORS_TREATED_AS_A_PERFECT_MATCH) == 0; + } + static bool isExactMatchWithIntentionalOmission(const ErrorType containedErrorTypes) { return (containedErrorTypes & ~ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION) == 0; } + static bool isMissingExplicitAccent(const ErrorType errorType) { + return (errorType & MATCH_WITH_MISSING_EXPLICIT_ACCENT) != 0; + } + static bool isEditCorrectionError(const ErrorType errorType) { return (errorType & EDIT_CORRECTION) != 0; } @@ -72,6 +82,7 @@ class ErrorTypeUtils { DISALLOW_IMPLICIT_CONSTRUCTORS(ErrorTypeUtils); static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH; + static const ErrorType ERRORS_TREATED_AS_A_PERFECT_MATCH; static const ErrorType ERRORS_TREATED_AS_AN_EXACT_MATCH_WITH_INTENTIONAL_OMISSION; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/property/bigram_property.h b/native/jni/src/suggest/core/dictionary/property/bigram_property.h deleted file mode 100644 index 343af143c..000000000 --- a/native/jni/src/suggest/core/dictionary/property/bigram_property.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_BIGRAM_PROPERTY_H -#define LATINIME_BIGRAM_PROPERTY_H - -#include <vector> - -#include "defines.h" - -namespace latinime { - -// TODO: Change to NgramProperty. -class BigramProperty { - public: - BigramProperty(const std::vector<int> *const targetCodePoints, - const int probability, const int timestamp, const int level, const int count) - : mTargetCodePoints(*targetCodePoints), mProbability(probability), - mTimestamp(timestamp), mLevel(level), mCount(count) {} - - const std::vector<int> *getTargetCodePoints() const { - return &mTargetCodePoints; - } - - int getProbability() const { - return mProbability; - } - - int getTimestamp() const { - return mTimestamp; - } - - int getLevel() const { - return mLevel; - } - - int getCount() const { - return mCount; - } - - private: - // Default copy constructor and assign operator are used for using in std::vector. - DISALLOW_DEFAULT_CONSTRUCTOR(BigramProperty); - - // TODO: Make members const. - std::vector<int> mTargetCodePoints; - int mProbability; - int mTimestamp; - int mLevel; - int mCount; -}; -} // namespace latinime -#endif // LATINIME_WORD_PROPERTY_H diff --git a/native/jni/src/suggest/core/dictionary/property/unigram_property.h b/native/jni/src/suggest/core/dictionary/property/unigram_property.h deleted file mode 100644 index 902eb000f..000000000 --- a/native/jni/src/suggest/core/dictionary/property/unigram_property.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_UNIGRAM_PROPERTY_H -#define LATINIME_UNIGRAM_PROPERTY_H - -#include <vector> - -#include "defines.h" - -namespace latinime { - -class UnigramProperty { - public: - class ShortcutProperty { - public: - ShortcutProperty(const std::vector<int> *const targetCodePoints, const int probability) - : mTargetCodePoints(*targetCodePoints), mProbability(probability) {} - - const std::vector<int> *getTargetCodePoints() const { - return &mTargetCodePoints; - } - - int getProbability() const { - return mProbability; - } - - private: - // Default copy constructor and assign operator are used for using in std::vector. - DISALLOW_DEFAULT_CONSTRUCTOR(ShortcutProperty); - - // TODO: Make members const. - std::vector<int> mTargetCodePoints; - int mProbability; - }; - - UnigramProperty() - : mRepresentsBeginningOfSentence(false), mIsNotAWord(false), mIsBlacklisted(false), - mProbability(NOT_A_PROBABILITY), mTimestamp(NOT_A_TIMESTAMP), mLevel(0), mCount(0), - mShortcuts() {} - - UnigramProperty(const bool representsBeginningOfSentence, const bool isNotAWord, - const bool isBlacklisted, const int probability, const int timestamp, const int level, - const int count, const std::vector<ShortcutProperty> *const shortcuts) - : mRepresentsBeginningOfSentence(representsBeginningOfSentence), - mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mProbability(probability), - mTimestamp(timestamp), mLevel(level), mCount(count), mShortcuts(*shortcuts) {} - - bool representsBeginningOfSentence() const { - return mRepresentsBeginningOfSentence; - } - - bool isNotAWord() const { - return mIsNotAWord; - } - - bool isBlacklisted() const { - return mIsBlacklisted; - } - - bool hasShortcuts() const { - return !mShortcuts.empty(); - } - - int getProbability() const { - return mProbability; - } - - int getTimestamp() const { - return mTimestamp; - } - - int getLevel() const { 
- return mLevel; - } - - int getCount() const { - return mCount; - } - - const std::vector<ShortcutProperty> &getShortcuts() const { - return mShortcuts; - } - - private: - // Default copy constructor is used for using as a return value. - DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); - - // TODO: Make members const. - bool mRepresentsBeginningOfSentence; - bool mIsNotAWord; - bool mIsBlacklisted; - int mProbability; - // Historical information - int mTimestamp; - int mLevel; - int mCount; - std::vector<ShortcutProperty> mShortcuts; -}; -} // namespace latinime -#endif // LATINIME_UNIGRAM_PROPERTY_H diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp deleted file mode 100644 index 5bdd5606b..000000000 --- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/core/dictionary/property/word_property.h" - -#include "utils/jni_data_utils.h" - -namespace latinime { - -void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, - jobject outBigramProbabilities, jobject outShortcutTargets, - jobject outShortcutProbabilities) const { - JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, - MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(), - false /* needsNullTermination */); - jboolean flags[] = {mUnigramProperty.isNotAWord(), mUnigramProperty.isBlacklisted(), - !mBigrams.empty(), mUnigramProperty.hasShortcuts(), - mUnigramProperty.representsBeginningOfSentence()}; - env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); - int probabilityInfo[] = {mUnigramProperty.getProbability(), mUnigramProperty.getTimestamp(), - mUnigramProperty.getLevel(), mUnigramProperty.getCount()}; - env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo), - probabilityInfo); - - jclass integerClass = env->FindClass("java/lang/Integer"); - jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); - jclass arrayListClass = env->FindClass("java/util/ArrayList"); - jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); - - // Output bigrams. 
- for (const auto &bigramProperty : mBigrams) { - const std::vector<int> *const word1CodePoints = bigramProperty.getTargetCodePoints(); - jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); - JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */, - word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(), - false /* needsNullTermination */); - env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); - env->DeleteLocalRef(bigramWord1CodePointArray); - - int bigramProbabilityInfo[] = {bigramProperty.getProbability(), - bigramProperty.getTimestamp(), bigramProperty.getLevel(), - bigramProperty.getCount()}; - jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); - env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, - NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); - env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray); - env->DeleteLocalRef(bigramProbabilityInfoArray); - } - - // Output shortcuts. 
- for (const auto &shortcut : mUnigramProperty.getShortcuts()) { - const std::vector<int> *const targetCodePoints = shortcut.getTargetCodePoints(); - jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size()); - env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */, - targetCodePoints->size(), targetCodePoints->data()); - JniDataUtils::outputCodePoints(env, shortcutTargetCodePointArray, 0 /* start */, - targetCodePoints->size(), targetCodePoints->data(), targetCodePoints->size(), - false /* needsNullTermination */); - env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); - env->DeleteLocalRef(shortcutTargetCodePointArray); - jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, - shortcut.getProbability()); - env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability); - env->DeleteLocalRef(integerProbability); - } - env->DeleteLocalRef(integerClass); - env->DeleteLocalRef(arrayListClass); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp b/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp index 34b8b37b0..8b39f7da5 100644 --- a/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp +++ b/native/jni/src/suggest/core/layout/additional_proximity_chars.cpp @@ -19,7 +19,7 @@ namespace latinime { // TODO: Stop using hardcoded additional proximity characters. // TODO: Have proximity character informations in each language's binary dictionary. 
-const char *AdditionalProximityChars::LOCALE_EN_US = "en"; +const int AdditionalProximityChars::LOCALE_EN_US[LOCALE_EN_US_SIZE] = { 'e', 'n' }; const int AdditionalProximityChars::EN_US_ADDITIONAL_A[EN_US_ADDITIONAL_A_SIZE] = { 'e', 'i', 'o', 'u' diff --git a/native/jni/src/suggest/core/layout/additional_proximity_chars.h b/native/jni/src/suggest/core/layout/additional_proximity_chars.h index a88fd6cea..2260be9bd 100644 --- a/native/jni/src/suggest/core/layout/additional_proximity_chars.h +++ b/native/jni/src/suggest/core/layout/additional_proximity_chars.h @@ -18,6 +18,7 @@ #define LATINIME_ADDITIONAL_PROXIMITY_CHARS_H #include <cstring> +#include <vector> #include "defines.h" @@ -26,7 +27,8 @@ namespace latinime { class AdditionalProximityChars { private: DISALLOW_IMPLICIT_CONSTRUCTORS(AdditionalProximityChars); - static const char *LOCALE_EN_US; + static const int LOCALE_EN_US_SIZE = 2; + static const int LOCALE_EN_US[LOCALE_EN_US_SIZE]; static const int EN_US_ADDITIONAL_A_SIZE = 4; static const int EN_US_ADDITIONAL_A[]; static const int EN_US_ADDITIONAL_E_SIZE = 4; @@ -38,15 +40,22 @@ class AdditionalProximityChars { static const int EN_US_ADDITIONAL_U_SIZE = 4; static const int EN_US_ADDITIONAL_U[]; - AK_FORCE_INLINE static bool isEnLocale(const char *localeStr) { - const size_t LOCALE_EN_US_SIZE = strlen(LOCALE_EN_US); - return localeStr && strlen(localeStr) >= LOCALE_EN_US_SIZE - && strncmp(localeStr, LOCALE_EN_US, LOCALE_EN_US_SIZE) == 0; + AK_FORCE_INLINE static bool isEnLocale(const std::vector<int> *locale) { + const int NCHARS = NELEMS(LOCALE_EN_US); + if (locale->size() < NCHARS) { + return false; + } + for (int i = 0; i < NCHARS; ++i) { + if ((*locale)[i] != LOCALE_EN_US[i]) { + return false; + } + } + return true; } public: - static int getAdditionalCharsSize(const char *const localeStr, const int c) { - if (!isEnLocale(localeStr)) { + static int getAdditionalCharsSize(const std::vector<int> *locale, const int c) { + if (!isEnLocale(locale)) { 
return 0; } switch (c) { @@ -65,8 +74,8 @@ class AdditionalProximityChars { } } - static const int *getAdditionalChars(const char *const localeStr, const int c) { - if (!isEnLocale(localeStr)) { + static const int *getAdditionalChars(const std::vector<int> *locale, const int c) { + if (!isEnLocale(locale)) { return 0; } switch (c) { diff --git a/native/jni/src/suggest/core/layout/geometry_utils.h b/native/jni/src/suggest/core/layout/geometry_utils.h index b667df68f..000fcd4a1 100644 --- a/native/jni/src/suggest/core/layout/geometry_utils.h +++ b/native/jni/src/suggest/core/layout/geometry_utils.h @@ -38,13 +38,15 @@ class GeometryUtils { } static AK_FORCE_INLINE float getAngleDiff(const float a1, const float a2) { - const float deltaA = fabsf(a1 - a2); - const float diff = ROUND_FLOAT_10000(deltaA); - if (diff > M_PI_F) { - const float normalizedDiff = 2.0f * M_PI_F - diff; - return ROUND_FLOAT_10000(normalizedDiff); + static const float M_2PI_F = M_PI * 2.0f; + float delta = fabsf(a1 - a2); + if (delta > M_2PI_F) { + delta -= (M_2PI_F * static_cast<int>(delta / M_2PI_F)); } - return diff; + if (delta > M_PI_F) { + delta = M_2PI_F - delta; + } + return ROUND_FLOAT_10000(delta); } static AK_FORCE_INLINE int getDistanceInt(const int x1, const int y1, const int x2, diff --git a/native/jni/src/suggest/core/layout/proximity_info.cpp b/native/jni/src/suggest/core/layout/proximity_info.cpp index 4c75a188e..933a5e145 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info.cpp @@ -49,13 +49,13 @@ static AK_FORCE_INLINE void safeGetOrFillZeroFloatArrayRegion(JNIEnv *env, jfloa } } -ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr, - const int keyboardWidth, const int keyboardHeight, const int gridWidth, - const int gridHeight, const int mostCommonKeyWidth, const int mostCommonKeyHeight, - const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates, - const jintArray 
keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights, - const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs, - const jfloatArray sweetSpotCenterYs, const jfloatArray sweetSpotRadii) +ProximityInfo::ProximityInfo(JNIEnv *env, const int keyboardWidth, const int keyboardHeight, + const int gridWidth, const int gridHeight, const int mostCommonKeyWidth, + const int mostCommonKeyHeight, const jintArray proximityChars, const int keyCount, + const jintArray keyXCoordinates, const jintArray keyYCoordinates, + const jintArray keyWidths, const jintArray keyHeights, const jintArray keyCharCodes, + const jfloatArray sweetSpotCenterXs, const jfloatArray sweetSpotCenterYs, + const jfloatArray sweetSpotRadii) : GRID_WIDTH(gridWidth), GRID_HEIGHT(gridHeight), MOST_COMMON_KEY_WIDTH(mostCommonKeyWidth), MOST_COMMON_KEY_WIDTH_SQUARE(mostCommonKeyWidth * mostCommonKeyWidth), NORMALIZED_SQUARED_MOST_COMMON_KEY_HYPOTENUSE(1.0f + @@ -82,13 +82,6 @@ ProximityInfo::ProximityInfo(JNIEnv *env, const jstring localeJStr, if (DEBUG_PROXIMITY_INFO) { AKLOGI("Create proximity info array %d", proximityCharsLength); } - const jsize localeCStrUtf8Length = env->GetStringUTFLength(localeJStr); - if (localeCStrUtf8Length >= MAX_LOCALE_STRING_LENGTH) { - AKLOGI("Locale string length too long: length=%d", localeCStrUtf8Length); - ASSERT(false); - } - memset(mLocaleStr, 0, sizeof(mLocaleStr)); - env->GetStringUTFRegion(localeJStr, 0, env->GetStringLength(localeJStr), mLocaleStr); safeGetOrFillZeroIntArrayRegion(env, proximityChars, proximityCharsLength, mProximityCharsArray); safeGetOrFillZeroIntArrayRegion(env, keyXCoordinates, KEY_COUNT, mKeyXCoordinates); diff --git a/native/jni/src/suggest/core/layout/proximity_info.h b/native/jni/src/suggest/core/layout/proximity_info.h index d4e453736..f7c907697 100644 --- a/native/jni/src/suggest/core/layout/proximity_info.h +++ b/native/jni/src/suggest/core/layout/proximity_info.h @@ -18,6 +18,7 @@ #define LATINIME_PROXIMITY_INFO_H 
#include <unordered_map> +#include <vector> #include "defines.h" #include "jni.h" @@ -27,9 +28,9 @@ namespace latinime { class ProximityInfo { public: - ProximityInfo(JNIEnv *env, const jstring localeJStr, - const int keyboardWidth, const int keyboardHeight, const int gridWidth, - const int gridHeight, const int mostCommonKeyWidth, const int mostCommonKeyHeight, + ProximityInfo(JNIEnv *env, const int keyboardWidth, const int keyboardHeight, + const int gridWidth, const int gridHeight, + const int mostCommonKeyWidth, const int mostCommonKeyHeight, const jintArray proximityChars, const int keyCount, const jintArray keyXCoordinates, const jintArray keyYCoordinates, const jintArray keyWidths, const jintArray keyHeights, const jintArray keyCharCodes, const jfloatArray sweetSpotCenterXs, @@ -71,11 +72,11 @@ class ProximityInfo { AK_FORCE_INLINE void initializeProximities(const int *const inputCodes, const int *const inputXCoordinates, const int *const inputYCoordinates, - const int inputSize, int *allInputCodes) const { + const int inputSize, int *allInputCodes, const std::vector<int> *locale) const { ProximityInfoUtils::initializeProximities(inputCodes, inputXCoordinates, inputYCoordinates, inputSize, mKeyXCoordinates, mKeyYCoordinates, mKeyWidths, mKeyHeights, mProximityCharsArray, CELL_HEIGHT, CELL_WIDTH, GRID_WIDTH, MOST_COMMON_KEY_WIDTH, - KEY_COUNT, mLocaleStr, &mLowerCodePointToKeyMap, allInputCodes); + KEY_COUNT, locale, &mLowerCodePointToKeyMap, allInputCodes); } AK_FORCE_INLINE int getKeyIndexOf(const int c) const { @@ -103,9 +104,6 @@ class ProximityInfo { const int KEYBOARD_HEIGHT; const float KEYBOARD_HYPOTENUSE; const bool HAS_TOUCH_POSITION_CORRECTION_DATA; - // Assuming locale strings such as en_US, sr-Latn etc. 
- static const int MAX_LOCALE_STRING_LENGTH = 10; - char mLocaleStr[MAX_LOCALE_STRING_LENGTH]; int *mProximityCharsArray; int mKeyXCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; int mKeyYCoordinates[MAX_KEY_COUNT_IN_A_KEYBOARD]; diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.cpp b/native/jni/src/suggest/core/layout/proximity_info_state.cpp index 91469e26d..d43a0026a 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state.cpp +++ b/native/jni/src/suggest/core/layout/proximity_info_state.cpp @@ -42,7 +42,7 @@ int ProximityInfoState::getPrimaryOriginalCodePointAt(const int index) const { void ProximityInfoState::initInputParams(const int pointerId, const float maxPointToKeyLength, const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize, const int *const xCoordinates, const int *const yCoordinates, const int *const times, - const int *const pointerIds, const bool isGeometric) { + const int *const pointerIds, const bool isGeometric, const std::vector<int> *locale) { ASSERT(isGeometric || (inputSize < MAX_WORD_LENGTH)); mIsContinuousSuggestionPossible = (mHasBeenUpdatedByGeometricInput != isGeometric) ? 
false : ProximityInfoStateUtils::checkAndReturnIsContinuousSuggestionPossible( @@ -66,7 +66,7 @@ void ProximityInfoState::initInputParams(const int pointerId, const float maxPoi if (!isGeometric && pointerId == 0) { mProximityInfo->initializeProximities(inputCodes, xCoordinates, yCoordinates, - inputSize, mInputProximities); + inputSize, mInputProximities, locale); } /////////////////////// diff --git a/native/jni/src/suggest/core/layout/proximity_info_state.h b/native/jni/src/suggest/core/layout/proximity_info_state.h index e6180fe17..a2d663544 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_state.h +++ b/native/jni/src/suggest/core/layout/proximity_info_state.h @@ -37,7 +37,8 @@ class ProximityInfoState { void initInputParams(const int pointerId, const float maxPointToKeyLength, const ProximityInfo *proximityInfo, const int *const inputCodes, const int inputSize, const int *xCoordinates, const int *yCoordinates, - const int *const times, const int *const pointerIds, const bool isGeometric); + const int *const times, const int *const pointerIds, const bool isGeometric, + const std::vector<int> *locale); ///////////////////////////////////////// // Defined here // diff --git a/native/jni/src/suggest/core/layout/proximity_info_utils.h b/native/jni/src/suggest/core/layout/proximity_info_utils.h index 178aada2d..79d0615b8 100644 --- a/native/jni/src/suggest/core/layout/proximity_info_utils.h +++ b/native/jni/src/suggest/core/layout/proximity_info_utils.h @@ -19,6 +19,7 @@ #include <cmath> #include <unordered_map> +#include <vector> #include "defines.h" #include "suggest/core/layout/additional_proximity_chars.h" @@ -51,7 +52,7 @@ class ProximityInfoUtils { const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, const int *const proximityCharsArray, const int cellHeight, const int cellWidth, const int gridWidth, const int mostCommonKeyWidth, const int keyCount, - const char *const localeStr, + const std::vector<int> *locale, 
const std::unordered_map<int, int> *const codeToKeyMap, int *inputProximities) { // Initialize // - mInputCodes @@ -64,7 +65,7 @@ class ProximityInfoUtils { int *proximities = &inputProximities[i * MAX_PROXIMITY_CHARS_SIZE]; calculateProximities(keyXCoordinates, keyYCoordinates, keyWidths, keyHeights, proximityCharsArray, cellHeight, cellWidth, gridWidth, mostCommonKeyWidth, - keyCount, x, y, primaryKey, localeStr, codeToKeyMap, proximities); + keyCount, x, y, primaryKey, locale, codeToKeyMap, proximities); } if (DEBUG_PROXIMITY_CHARS) { @@ -143,7 +144,7 @@ class ProximityInfoUtils { const int *const keyYCoordinates, const int *const keyWidths, const int *keyHeights, const int *const proximityCharsArray, const int cellHeight, const int cellWidth, const int gridWidth, const int mostCommonKeyWidth, const int keyCount, - const int x, const int y, const int primaryKey, const char *const localeStr, + const int x, const int y, const int primaryKey, const std::vector<int> *locale, const std::unordered_map<int, int> *const codeToKeyMap, int *proximities) { const int mostCommonKeyWidthSquare = mostCommonKeyWidth * mostCommonKeyWidth; int insertPos = 0; @@ -177,7 +178,7 @@ class ProximityInfoUtils { } } const int additionalProximitySize = - AdditionalProximityChars::getAdditionalCharsSize(localeStr, primaryKey); + AdditionalProximityChars::getAdditionalCharsSize(locale, primaryKey); if (additionalProximitySize > 0) { proximities[insertPos++] = ADDITIONAL_PROXIMITY_CHAR_DELIMITER_CODE; if (insertPos >= MAX_PROXIMITY_CHARS_SIZE) { @@ -188,7 +189,7 @@ class ProximityInfoUtils { } const int *additionalProximityChars = - AdditionalProximityChars::getAdditionalChars(localeStr, primaryKey); + AdditionalProximityChars::getAdditionalChars(locale, primaryKey); for (int j = 0; j < additionalProximitySize; ++j) { const int ac = additionalProximityChars[j]; int k = 0; diff --git a/native/jni/src/suggest/core/policy/scoring.h b/native/jni/src/suggest/core/policy/scoring.h index 
9e75cace4..b9dda83ad 100644 --- a/native/jni/src/suggest/core/policy/scoring.h +++ b/native/jni/src/suggest/core/policy/scoring.h @@ -30,11 +30,13 @@ class Scoring { public: virtual int calculateFinalScore(const float compoundDistance, const int inputSize, const ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, - const bool boostExactMatches) const = 0; + const bool boostExactMatches, const bool hasProbabilityZero) const = 0; virtual void getMostProbableString(const DicTraverseSession *const traverseSession, - const float languageWeight, SuggestionResults *const outSuggestionResults) const = 0; - virtual float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession, - DicNode *const terminals, const int size) const = 0; + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const = 0; + virtual float getAdjustedWeightOfLangModelVsSpatialModel( + DicTraverseSession *const traverseSession, DicNode *const terminals, + const int size) const = 0; virtual float getDoubleLetterDemotionDistanceCost( const DicNode *const terminalDicNode) const = 0; virtual bool autoCorrectsToMultiWordSuggestionIfTop() const = 0; diff --git a/native/jni/src/suggest/core/policy/traversal.h b/native/jni/src/suggest/core/policy/traversal.h index 8ddaa0514..5b6616d9a 100644 --- a/native/jni/src/suggest/core/policy/traversal.h +++ b/native/jni/src/suggest/core/policy/traversal.h @@ -44,11 +44,12 @@ class Traversal { virtual bool needsToTraverseAllUserInput() const = 0; virtual float getMaxSpatialDistance() const = 0; virtual int getDefaultExpandDicNodeSize() const = 0; - virtual int getMaxCacheSize(const int inputSize) const = 0; + virtual int getMaxCacheSize(const int inputSize, const float weightForLocale) const = 0; virtual int getTerminalCacheSize() const = 0; virtual bool isPossibleOmissionChildNode(const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, const DicNode *const dicNode) const 
= 0; - virtual bool isGoodToTraverseNextWord(const DicNode *const dicNode) const = 0; + virtual bool isGoodToTraverseNextWord(const DicNode *const dicNode, + const int probability) const = 0; protected: Traversal() {} diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp index c202b81fe..450203d98 100644 --- a/native/jni/src/suggest/core/policy/weighting.cpp +++ b/native/jni/src/suggest/core/policy/weighting.cpp @@ -110,12 +110,16 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n return weighting->getOmissionCost(parentDicNode, dicNode); case CT_ADDITIONAL_PROXIMITY: // only used for typing - return weighting->getAdditionalProximityCost(); + // TODO: Quit calling getMatchedCost(). + return weighting->getAdditionalProximityCost() + + weighting->getMatchedCost(traverseSession, dicNode, inputStateG); case CT_SUBSTITUTION: // only used for typing - return weighting->getSubstitutionCost(); + // TODO: Quit calling getMatchedCost(). 
+ return weighting->getSubstitutionCost() + + weighting->getMatchedCost(traverseSession, dicNode, inputStateG); case CT_NEW_WORD_SPACE_OMISSION: - return weighting->getNewWordSpatialCost(traverseSession, dicNode, inputStateG); + return weighting->getSpaceOmissionCost(traverseSession, dicNode, inputStateG); case CT_MATCH: return weighting->getMatchedCost(traverseSession, dicNode, inputStateG); case CT_COMPLETION: @@ -176,9 +180,9 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n case CT_OMISSION: return 0; case CT_ADDITIONAL_PROXIMITY: - return 0; /* 0 because CT_MATCH will be called */ + return 1; case CT_SUBSTITUTION: - return 0; /* 0 because CT_MATCH will be called */ + return 1; case CT_NEW_WORD_SPACE_OMISSION: return 0; case CT_MATCH: diff --git a/native/jni/src/suggest/core/policy/weighting.h b/native/jni/src/suggest/core/policy/weighting.h index bd6b3cf41..863c4eabe 100644 --- a/native/jni/src/suggest/core/policy/weighting.h +++ b/native/jni/src/suggest/core/policy/weighting.h @@ -57,7 +57,7 @@ class Weighting { const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0; - virtual float getNewWordSpatialCost(const DicTraverseSession *const traverseSession, + virtual float getSpaceOmissionCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, DicNode_InputStateG *const inputStateG) const = 0; virtual float getNewWordBigramLanguageCost( diff --git a/native/jni/src/suggest/core/result/suggestion_results.cpp b/native/jni/src/suggest/core/result/suggestion_results.cpp index 4c10bd08a..3756d1092 100644 --- a/native/jni/src/suggest/core/result/suggestion_results.cpp +++ b/native/jni/src/suggest/core/result/suggestion_results.cpp @@ -23,7 +23,7 @@ namespace latinime { void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCount, jintArray outputCodePointsArray, jintArray outScoresArray, jintArray outSpaceIndicesArray, 
jintArray outTypesArray, jintArray outAutoCommitFirstWordConfidenceArray, - jfloatArray outLanguageWeight) { + jfloatArray outWeightOfLangModelVsSpatialModel) { int outputIndex = 0; while (!mSuggestedWords.empty()) { const SuggestedWord &suggestedWord = mSuggestedWords.top(); @@ -44,7 +44,8 @@ void SuggestionResults::outputSuggestions(JNIEnv *env, jintArray outSuggestionCo mSuggestedWords.pop(); } JniDataUtils::putIntToArray(env, outSuggestionCount, 0 /* index */, outputIndex); - JniDataUtils::putFloatToArray(env, outLanguageWeight, 0 /* index */, mLanguageWeight); + JniDataUtils::putFloatToArray(env, outWeightOfLangModelVsSpatialModel, 0 /* index */, + mWeightOfLangModelVsSpatialModel); } void SuggestionResults::addPrediction(const int *const codePoints, const int codePointCount, @@ -89,7 +90,7 @@ void SuggestionResults::getSortedScores(int *const outScores) const { } void SuggestionResults::dumpSuggestions() const { - AKLOGE("language weight: %f", mLanguageWeight); + AKLOGE("weight of language model vs spatial model: %f", mWeightOfLangModelVsSpatialModel); std::vector<SuggestedWord> suggestedWords; auto copyOfSuggestedWords = mSuggestedWords; while (!copyOfSuggestedWords.empty()) { diff --git a/native/jni/src/suggest/core/result/suggestion_results.h b/native/jni/src/suggest/core/result/suggestion_results.h index 8e845e2d3..738c78a9f 100644 --- a/native/jni/src/suggest/core/result/suggestion_results.h +++ b/native/jni/src/suggest/core/result/suggestion_results.h @@ -29,13 +29,15 @@ namespace latinime { class SuggestionResults { public: explicit SuggestionResults(const int maxSuggestionCount) - : mMaxSuggestionCount(maxSuggestionCount), mLanguageWeight(NOT_A_LANGUAGE_WEIGHT), + : mMaxSuggestionCount(maxSuggestionCount), + mWeightOfLangModelVsSpatialModel(NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL), mSuggestedWords() {} // Returns suggestion count. 
void outputSuggestions(JNIEnv *env, jintArray outSuggestionCount, jintArray outCodePointsArray, jintArray outScoresArray, jintArray outSpaceIndicesArray, jintArray outTypesArray, - jintArray outAutoCommitFirstWordConfidenceArray, jfloatArray outLanguageWeight); + jintArray outAutoCommitFirstWordConfidenceArray, + jfloatArray outWeightOfLangModelVsSpatialModel); void addPrediction(const int *const codePoints, const int codePointCount, const int score); void addSuggestion(const int *const codePoints, const int codePointCount, const int score, const int type, const int indexToPartialCommit, @@ -43,8 +45,8 @@ class SuggestionResults { void getSortedScores(int *const outScores) const; void dumpSuggestions() const; - void setLanguageWeight(const float languageWeight) { - mLanguageWeight = languageWeight; + void setWeightOfLangModelVsSpatialModel(const float weightOfLangModelVsSpatialModel) { + mWeightOfLangModelVsSpatialModel = weightOfLangModelVsSpatialModel; } int getSuggestionCount() const { @@ -55,7 +57,7 @@ class SuggestionResults { DISALLOW_IMPLICIT_CONSTRUCTORS(SuggestionResults); const int mMaxSuggestionCount; - float mLanguageWeight; + float mWeightOfLangModelVsSpatialModel; std::priority_queue< SuggestedWord, std::vector<SuggestedWord>, SuggestedWord::Comparator> mSuggestedWords; }; diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp index 0b99b75ec..7c37241de 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp @@ -19,9 +19,9 @@ #include <algorithm> #include <vector> +#include "dictionary/utils/binary_dictionary_shortcut_iterator.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_utils.h" -#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" #include "suggest/core/dictionary/error_type_utils.h" #include 
"suggest/core/policy/scoring.h" #include "suggest/core/result/suggestion_results.h" @@ -34,7 +34,8 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; /* static */ void SuggestionsOutputUtils::outputSuggestions( const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, - const float languageWeight, SuggestionResults *const outSuggestionResults) { + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) { #if DEBUG_EVALUATE_MOST_PROBABLE_STRING const int terminalSize = 0; #else @@ -44,12 +45,15 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; for (int index = terminalSize - 1; index >= 0; --index) { traverseSession->getDicTraverseCache()->popTerminal(&terminals[index]); } - // Compute a language weight when an invalid language weight is passed. - // NOT_A_LANGUAGE_WEIGHT (-1) is assumed as an invalid language weight. - const float languageWeightToOutputSuggestions = (languageWeight < 0.0f) ? - scoringPolicy->getAdjustedLanguageWeight( - traverseSession, terminals.data(), terminalSize) : languageWeight; - outSuggestionResults->setLanguageWeight(languageWeightToOutputSuggestions); + // Compute a weight of language model when an invalid weight is passed. + // NOT_A_WEIGHT_OF_LANG_MODEL_VS_SPATIAL_MODEL (-1) is taken as an invalid value. + const float weightOfLangModelVsSpatialModelToOutputSuggestions = + (weightOfLangModelVsSpatialModel < 0.0f) + ? scoringPolicy->getAdjustedWeightOfLangModelVsSpatialModel(traverseSession, + terminals.data(), terminalSize) + : weightOfLangModelVsSpatialModel; + outSuggestionResults->setWeightOfLangModelVsSpatialModel( + weightOfLangModelVsSpatialModelToOutputSuggestions); // Force autocorrection for obvious long multi-word suggestions when the top suggestion is // a long multiple words suggestion. // TODO: Implement a smarter auto-commit method for handling multi-word suggestions. 
@@ -65,16 +69,62 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; // Output suggestion results here for (auto &terminalDicNode : terminals) { outputSuggestionsOfDicNode(scoringPolicy, traverseSession, &terminalDicNode, - languageWeightToOutputSuggestions, boostExactMatches, forceCommitMultiWords, - outputSecondWordFirstLetterInputIndex, outSuggestionResults); + weightOfLangModelVsSpatialModelToOutputSuggestions, boostExactMatches, + forceCommitMultiWords, outputSecondWordFirstLetterInputIndex, outSuggestionResults); + } + scoringPolicy->getMostProbableString(traverseSession, + weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults); +} + +/* static */ bool SuggestionsOutputUtils::shouldBlockWord( + const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode, + const WordAttributes wordAttributes, const bool isLastWord) { + const bool currentWordExactMatch = + ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); + // When we have to block offensive words, non-exact matched offensive words should not be + // output. + const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords(); + + const bool isBlockedOffensiveWord = shouldBlockOffensiveWords && + wordAttributes.isPossiblyOffensive(); + + // This function is called in two situations: + // + // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode + // of the search, and isLastWord will be true. + // "fuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=true) + // In this case, if the current word is an exact match, we will always let the word + // through, even if the user is blocking offensive words (it's exactly what they typed!) + // + // 2) In the middle of the search, when we hit a terminal node, to decide whether or not + // to start a new search at root, to try to match the rest of the input. 
In this case, + // terminalDicNode will point to the terminal node we just hit, and isLastWord will be + // false. + // "fuckvthis" + // | + // \ terminalDicNode (isLastWord=false, currentWordExactMatch=true) + // + // In this case, we should NOT allow the match through (correcting "fuckthis" to "fuck this" + // when offensive words are blocked would be a bad idea). + // + // In the case of a multi-word correction where the offensive word is typed last (eg. + // for the input "allfuck"), this function will be called with isLastWord==true, but + // currentWordExactMatch==false. So we are OK in this case as well. + // "allfuck" + // | + // \ terminalDicNode (isLastWord=true, currentWordExactMatch=false) + if (isLastWord && currentWordExactMatch) { + return false; + } else { + return isBlockedOffensiveWord; } - scoringPolicy->getMostProbableString(traverseSession, languageWeightToOutputSuggestions, - outSuggestionResults); } /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode( const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, - const DicNode *const terminalDicNode, const float languageWeight, + const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel, const bool boostExactMatches, const bool forceCommitMultiWords, const bool outputSecondWordFirstLetterInputIndex, SuggestionResults *const outSuggestionResults) { @@ -83,34 +133,32 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; } const float doubleLetterCost = scoringPolicy->getDoubleLetterDemotionDistanceCost(terminalDicNode); - const float compoundDistance = terminalDicNode->getCompoundDistance(languageWeight) - + doubleLetterCost; - const bool isPossiblyOffensiveWord = - traverseSession->getDictionaryStructurePolicy()->getProbability( - terminalDicNode->getProbability(), NOT_A_PROBABILITY) <= 0; + const float compoundDistance = + terminalDicNode->getCompoundDistance(weightOfLangModelVsSpatialModel) + + doubleLetterCost; + 
const WordAttributes wordAttributes = traverseSession->getDictionaryStructurePolicy() + ->getWordAttributesInContext(terminalDicNode->getPrevWordIds(), + terminalDicNode->getWordId(), nullptr /* multiBigramMap */); const bool isExactMatch = ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes()); const bool isExactMatchWithIntentionalOmission = ErrorTypeUtils::isExactMatchWithIntentionalOmission( terminalDicNode->getContainedErrorTypes()); - const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase(); - // Heuristic: We exclude probability=0 first-char-uppercase words from exact match. - // (e.g. "AMD" and "and") - const bool isSafeExactMatch = isExactMatch - && !(isPossiblyOffensiveWord && isFirstCharUppercase); + // TODO: Decide whether the word should be auto-corrected or not here. + const bool isAppropriateForAutoCorrection = !ErrorTypeUtils::isMissingExplicitAccent( + terminalDicNode->getContainedErrorTypes()); const int outputTypeFlags = - (isPossiblyOffensiveWord ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) - | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) + (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0) + | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0) | (isExactMatchWithIntentionalOmission ? - Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0); - + Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0) + | (isAppropriateForAutoCorrection ? + Dictionary::KIND_FLAG_APPROPRIATE_FOR_AUTOCORRECTION : 0); // Entries that are blacklisted or do not represent a word should not be output. - const bool isValidWord = !terminalDicNode->isBlacklistedOrNotAWord(); - // When we have to block offensive words, non-exact matched offensive words should not be - // output. 
- const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords(); - const bool isBlockedOffensiveWord = blockOffensiveWords && isPossiblyOffensiveWord - && !isSafeExactMatch; + const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord()); + + const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(), + terminalDicNode, wordAttributes, true /* isLastWord */); // Increase output score of top typing suggestion to ensure autocorrection. // TODO: Better integration with java side autocorrection logic. @@ -118,11 +166,11 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; compoundDistance, traverseSession->getInputSize(), terminalDicNode->getContainedErrorTypes(), (forceCommitMultiWords && terminalDicNode->hasMultipleWords()), - boostExactMatches); + boostExactMatches, wordAttributes.getProbability() == 0); // Don't output invalid or blocked offensive words. However, we still need to submit their // shortcuts if any. - if (isValidWord && !isBlockedOffensiveWord) { + if (isValidWord && !shouldBlockThisWord) { int codePoints[MAX_WORD_LENGTH]; terminalDicNode->outputResult(codePoints); const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ? @@ -139,10 +187,9 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16; // Shortcut is not supported for multiple words suggestions. // TODO: Check shortcuts during traversal for multiple words suggestions. 
if (!terminalDicNode->hasMultipleWords()) { - BinaryDictionaryShortcutIterator shortcutIt( - traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), - traverseSession->getDictionaryStructurePolicy() - ->getShortcutPositionOfPtNode(terminalDicNode->getPtNodePos())); + BinaryDictionaryShortcutIterator shortcutIt = + traverseSession->getDictionaryStructurePolicy()->getShortcutIterator( + terminalDicNode->getWordId()); const bool sameAsTyped = scoringPolicy->sameAsTyped(traverseSession, terminalDicNode); outputShortcuts(&shortcutIt, finalScore, sameAsTyped, outSuggestionResults); } diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h index b099b4776..bcb75a483 100644 --- a/native/jni/src/suggest/core/result/suggestions_output_utils.h +++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h @@ -18,6 +18,7 @@ #define LATINIME_SUGGESTIONS_OUTPUT_UTILS #include "defines.h" +#include "dictionary/property/word_attributes.h" namespace latinime { @@ -25,15 +26,23 @@ class BinaryDictionaryShortcutIterator; class DicNode; class DicTraverseSession; class Scoring; +class SuggestOptions; class SuggestionResults; class SuggestionsOutputUtils { public: /** + * Returns true if we should block the incoming word, in the context of the user's + * preferences to include or not include possibly offensive words + */ + static bool shouldBlockWord(const SuggestOptions *const suggestOptions, + const DicNode *const terminalDicNode, const WordAttributes wordAttributes, + const bool isLastWord); + /** * Outputs the final list of suggestions (i.e., terminal nodes). 
*/ static void outputSuggestions(const Scoring *const scoringPolicy, - DicTraverseSession *traverseSession, const float languageWeight, + DicTraverseSession *traverseSession, const float weightOfLangModelVsSpatialModel, SuggestionResults *const outSuggestionResults); private: @@ -44,7 +53,7 @@ class SuggestionsOutputUtils { static void outputSuggestionsOfDicNode(const Scoring *const scoringPolicy, DicTraverseSession *traverseSession, const DicNode *const terminalDicNode, - const float languageWeight, const bool boostExactMatches, + const float weightOfLangModelVsSpatialModel, const bool boostExactMatches, const bool forceCommitMultiWords, const bool outputSecondWordFirstLetterInputIndex, SuggestionResults *const outSuggestionResults); static void outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt, diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index f1e411f38..d7dd5a02d 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -17,10 +17,10 @@ #include "suggest/core/session/dic_traverse_session.h" #include "defines.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/ngram_context.h" #include "suggest/core/dictionary/dictionary.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/core/session/prev_words_info.h" namespace latinime { @@ -30,13 +30,13 @@ const int DicTraverseSession::DICTIONARY_SIZE_THRESHOLD_TO_USE_LARGE_CACHE_FOR_S 256 * 1024; void DicTraverseSession::init(const Dictionary *const dictionary, - const PrevWordsInfo *const prevWordsInfo, const SuggestOptions *const suggestOptions) { + const NgramContext *const ngramContext, const 
SuggestOptions *const suggestOptions) { mDictionary = dictionary; mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy() ->getMultiWordCostMultiplier(); mSuggestOptions = suggestOptions; - prevWordsInfo->getPrevWordsTerminalPtNodePos( - getDictionaryStructurePolicy(), mPrevWordsPtNodePos, true /* tryLowerCaseSearch */); + mPrevWordIdCount = ngramContext->getPrevWordIds(getDictionaryStructurePolicy(), + &mPrevWordIdArray, true /* tryLowerCaseSearch */).size(); } void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo, @@ -69,8 +69,12 @@ void DicTraverseSession::initializeProximityInfoStates(const int *const inputCod for (int i = 0; i < maxPointerCount; ++i) { mProximityInfoStates[i].initInputParams(i, maxSpatialDistance, getProximityInfo(), inputCodePoints, inputSize, inputXs, inputYs, times, pointerIds, - maxPointerCount == MAX_POINTER_COUNT_G - /* TODO: this is a hack. fix proximity info state */); + // Right now the line below is trying to figure out whether this is a gesture by + // looking at the pointer count and assuming whatever is above the cutoff is + // a gesture and whatever is below is type. This is hacky and incorrect, we + // should pass the correct information instead. 
+ maxPointerCount == MAX_POINTER_COUNT_G, + getDictionaryStructurePolicy()->getHeaderStructurePolicy()->getLocale()); mInputSize += mProximityInfoStates[i].size(); } } diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index 5a51a112d..f5fcfddcd 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -20,16 +20,17 @@ #include <vector> #include "defines.h" +#include "dictionary/utils/multi_bigram_map.h" #include "jni.h" #include "suggest/core/dicnode/dic_nodes_cache.h" -#include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/layout/proximity_info_state.h" +#include "utils/int_array_view.h" namespace latinime { class Dictionary; class DictionaryStructureWithBufferPolicy; -class PrevWordsInfo; +class NgramContext; class ProximityInfo; class SuggestOptions; @@ -50,20 +51,17 @@ class DicTraverseSession { } AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr, bool usesLargeCache) - : mProximityInfo(nullptr), mDictionary(nullptr), mSuggestOptions(nullptr), - mDicNodesCache(usesLargeCache), mMultiBigramMap(), mInputSize(0), mMaxPointerCount(1), - mMultiWordCostMultiplier(1.0f) { + : mPrevWordIdCount(0), mProximityInfo(nullptr), mDictionary(nullptr), + mSuggestOptions(nullptr), mDicNodesCache(usesLargeCache), mMultiBigramMap(), + mInputSize(0), mMaxPointerCount(1), mMultiWordCostMultiplier(1.0f) { // NOTE: mProximityInfoStates is an array of instances. // No need to initialize it explicitly here. 
- for (size_t i = 0; i < NELEMS(mPrevWordsPtNodePos); ++i) { - mPrevWordsPtNodePos[i] = NOT_A_DICT_POS; - } } // Non virtual inline destructor -- never inherit this class AK_FORCE_INLINE ~DicTraverseSession() {} - void init(const Dictionary *dictionary, const PrevWordsInfo *const prevWordsInfo, + void init(const Dictionary *dictionary, const NgramContext *const ngramContext, const SuggestOptions *const suggestOptions); // TODO: Remove and merge into init void setupForGetSuggestions(const ProximityInfo *pInfo, const int *inputCodePoints, @@ -79,7 +77,9 @@ class DicTraverseSession { //-------------------- const ProximityInfo *getProximityInfo() const { return mProximityInfo; } const SuggestOptions *getSuggestOptions() const { return mSuggestOptions; } - const int *getPrevWordsPtNodePos() const { return mPrevWordsPtNodePos; } + const WordIdArrayView getPrevWordIds() const { + return WordIdArrayView::fromArray(mPrevWordIdArray).limit(mPrevWordIdCount); + } DicNodesCache *getDicTraverseCache() { return &mDicNodesCache; } MultiBigramMap *getMultiBigramMap() { return &mMultiBigramMap; } const ProximityInfoState *getProximityInfoState(int id) const { @@ -166,7 +166,8 @@ class DicTraverseSession { const int *const inputYs, const int *const times, const int *const pointerIds, const int inputSize, const float maxSpatialDistance, const int maxPointerCount); - int mPrevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + WordIdArray<MAX_PREV_WORD_COUNT_FOR_N_GRAM> mPrevWordIdArray; + size_t mPrevWordIdCount; const ProximityInfo *mProximityInfo; const Dictionary *mDictionary; const SuggestOptions *mSuggestOptions; diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h deleted file mode 100644 index e44e876e9..000000000 --- a/native/jni/src/suggest/core/session/prev_words_info.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (C) 2014 The Android Open Source Project - * - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_PREV_WORDS_INFO_H -#define LATINIME_PREV_WORDS_INFO_H - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "utils/char_utils.h" - -namespace latinime { - -// TODO: Support n-gram. -class PrevWordsInfo { - public: - // No prev word information. - PrevWordsInfo() { - clear(); - } - - PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) { - for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { - mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i]; - memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i], - sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); - mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i]; - } - } - - // Construct from previous words. 
- PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH], - const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, - const size_t prevWordCount) { - clear(); - for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) { - if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { - continue; - } - memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], - sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); - mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; - mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; - } - } - - // Construct from a previous word. - PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, - const bool isBeginningOfSentence) { - clear(); - if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { - return; - } - memmove(mPrevWordCodePoints[0], prevWordCodePoints, - sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); - mPrevWordCodePointCount[0] = prevWordCodePointCount; - mIsBeginningOfSentence[0] = isBeginningOfSentence; - } - - bool isValid() const { - if (mPrevWordCodePointCount[0] > 0) { - return true; - } - if (mIsBeginningOfSentence[0]) { - return true; - } - return false; - } - - void getPrevWordsTerminalPtNodePos( - const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - int *const outPrevWordsTerminalPtNodePos, const bool tryLowerCaseSearch) const { - for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { - outPrevWordsTerminalPtNodePos[i] = getTerminalPtNodePosOfWord(dictStructurePolicy, - mPrevWordCodePoints[i], mPrevWordCodePointCount[i], - mIsBeginningOfSentence[i], tryLowerCaseSearch); - } - } - - // n is 1-indexed. - const int *getNthPrevWordCodePoints(const int n) const { - if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { - return nullptr; - } - return mPrevWordCodePoints[n - 1]; - } - - // n is 1-indexed. 
- int getNthPrevWordCodePointCount(const int n) const { - if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { - return 0; - } - return mPrevWordCodePointCount[n - 1]; - } - - // n is 1-indexed. - bool isNthPrevWordBeginningOfSentence(const int n) const { - if (n <= 0 || n > MAX_PREV_WORD_COUNT_FOR_N_GRAM) { - return false; - } - return mIsBeginningOfSentence[n - 1]; - } - - private: - DISALLOW_COPY_AND_ASSIGN(PrevWordsInfo); - - static int getTerminalPtNodePosOfWord( - const DictionaryStructureWithBufferPolicy *const dictStructurePolicy, - const int *const wordCodePoints, const int wordCodePointCount, - const bool isBeginningOfSentence, const bool tryLowerCaseSearch) { - if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) { - return NOT_A_DICT_POS; - } - int codePoints[MAX_WORD_LENGTH]; - int codePointCount = wordCodePointCount; - memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount); - if (isBeginningOfSentence) { - codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, - codePointCount, MAX_WORD_LENGTH); - if (codePointCount <= 0) { - return NOT_A_DICT_POS; - } - } - const int wordPtNodePos = dictStructurePolicy->getTerminalPtNodePositionOfWord( - codePoints, codePointCount, false /* forceLowerCaseSearch */); - if (wordPtNodePos != NOT_A_DICT_POS || !tryLowerCaseSearch) { - // Return the position when when the word was found or doesn't try lower case - // search. - return wordPtNodePos; - } - // Check bigrams for lower-cased previous word if original was not found. Useful for - // auto-capitalized words like "The [current_word]". 
- return dictStructurePolicy->getTerminalPtNodePositionOfWord( - codePoints, codePointCount, true /* forceLowerCaseSearch */); - } - - void clear() { - for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { - mPrevWordCodePointCount[i] = 0; - mIsBeginningOfSentence[i] = false; - } - } - - int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; - int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; -}; -} // namespace latinime -#endif // LATINIME_PREV_WORDS_INFO_H diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 0cd305f5a..52fa5a5db 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -16,17 +16,20 @@ #include "suggest/core/suggest.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/property/word_attributes.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" #include "suggest/core/layout/proximity_info.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/traversal.h" #include "suggest/core/policy/weighting.h" #include "suggest/core/result/suggestions_output_utils.h" #include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/suggest_options.h" +#include "utils/profiler.h" namespace latinime { @@ -44,10 +47,10 @@ const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2; */ void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, - int inputSize, const float languageWeight, + int inputSize, const float weightOfLangModelVsSpatialModel, SuggestionResults *const 
outSuggestionResults) const { - PROF_OPEN; - PROF_START(0); + PROF_INIT; + PROF_TIMER_START(0); const float maxSpatialDistance = TRAVERSAL->getMaxSpatialDistance(); DicTraverseSession *tSession = static_cast<DicTraverseSession *>(traverseSession); tSession->setupForGetSuggestions(pInfo, inputCodePoints, inputSize, inputXs, inputYs, times, @@ -55,8 +58,8 @@ void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, // TODO: Add the way to evaluate cache initializeSearch(tSession); - PROF_END(0); - PROF_START(1); + PROF_TIMER_END(0); + PROF_TIMER_START(1); // keep expanding search dicNodes until all have terminated. while (tSession->getDicTraverseCache()->activeSize() > 0) { @@ -64,12 +67,11 @@ void Suggest::getSuggestions(ProximityInfo *pInfo, void *traverseSession, tSession->getDicTraverseCache()->advanceActiveDicNodes(); tSession->getDicTraverseCache()->advanceInputIndex(inputSize); } - PROF_END(1); - PROF_START(2); + PROF_TIMER_END(1); + PROF_TIMER_START(2); SuggestionsOutputUtils::outputSuggestions( - SCORING, tSession, languageWeight, outSuggestionResults); - PROF_END(2); - PROF_CLOSE; + SCORING, tSession, weightOfLangModelVsSpatialModel, outSuggestionResults); + PROF_TIMER_END(2); } /** @@ -87,12 +89,13 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession) const { traverseSession->getDicTraverseCache()->continueSearch(); } else { // Restart recognition at the root. 
- traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize()), + traverseSession->resetCache(TRAVERSAL->getMaxCacheSize(traverseSession->getInputSize(), + traverseSession->getSuggestOptions()->weightForLocale()), TRAVERSAL->getTerminalCacheSize()); // Create a new dic node here DicNode rootNode; DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(), - traverseSession->getPrevWordsPtNodePos(), &rootNode); + traverseSession->getPrevWordIds(), &rootNode); traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); } } @@ -157,8 +160,7 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { // TODO: Remove. Do not prune node here. const bool allowsErrorCorrections = TRAVERSAL->allowsErrorCorrections(&dicNode); // Process for handling space substitution (e.g., hevis => he is) - if (allowsErrorCorrections - && TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) { + if (TRAVERSAL->isSpaceSubstitutionTerminal(traverseSession, &dicNode)) { createNextWordDicNode(traverseSession, &dicNode, true /* spaceSubstitution */); } @@ -281,7 +283,6 @@ void Suggest::processDicNodeAsAdditionalProximityChar(DicTraverseSession *traver // not treat the node as a terminal. There is no need to pass the bigram map in these cases. 
Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_ADDITIONAL_PROXIMITY, traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */); - weightChildNode(traverseSession, childDicNode); processExpandedDicNode(traverseSession, childDicNode); } @@ -289,7 +290,6 @@ void Suggest::processDicNodeAsSubstitution(DicTraverseSession *traverseSession, DicNode *dicNode, DicNode *childDicNode) const { Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SUBSTITUTION, traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */); - weightChildNode(traverseSession, childDicNode); processExpandedDicNode(traverseSession, childDicNode); } @@ -400,7 +400,7 @@ void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicN if (dicNode->isCompletion(inputSize)) { Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_COMPLETION, traverseSession, 0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */); - } else { // completion + } else { Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_MATCH, traverseSession, 0 /* parentDicNode */, dicNode, 0 /* multiBigramMap */); } @@ -412,7 +412,16 @@ void Suggest::weightChildNode(DicTraverseSession *traverseSession, DicNode *dicN */ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode *dicNode, const bool spaceSubstitution) const { - if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode)) { + const WordAttributes wordAttributes = + traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext( + dicNode->getPrevWordIds(), dicNode->getWordId(), + traverseSession->getMultiBigramMap()); + if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(), + dicNode, wordAttributes, false /* isLastWord */)) { + return; + } + + if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) { return; } diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h index 788e0314b..65d5918cf 100644 --- 
a/native/jni/src/suggest/core/suggest.h +++ b/native/jni/src/suggest/core/suggest.h @@ -49,7 +49,8 @@ class Suggest : public SuggestInterface { AK_FORCE_INLINE virtual ~Suggest() {} void getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize, - const float languageWeight, SuggestionResults *const outSuggestionResults) const; + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const; private: DISALLOW_IMPLICIT_CONSTRUCTORS(Suggest); diff --git a/native/jni/src/suggest/core/suggest_interface.h b/native/jni/src/suggest/core/suggest_interface.h index a6e5aefae..a05aa9c80 100644 --- a/native/jni/src/suggest/core/suggest_interface.h +++ b/native/jni/src/suggest/core/suggest_interface.h @@ -28,7 +28,8 @@ class SuggestInterface { public: virtual void getSuggestions(ProximityInfo *pInfo, void *traverseSession, int *inputXs, int *inputYs, int *times, int *pointerIds, int *inputCodePoints, int inputSize, - const float languageWeight, SuggestionResults *const suggestionResults) const = 0; + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const suggestionResults) const = 0; SuggestInterface() {} virtual ~SuggestInterface() {} private: diff --git a/native/jni/src/suggest/core/suggest_options.h b/native/jni/src/suggest/core/suggest_options.h index d456680dd..4d331292b 100644 --- a/native/jni/src/suggest/core/suggest_options.h +++ b/native/jni/src/suggest/core/suggest_options.h @@ -42,6 +42,12 @@ class SuggestOptions{ return getBoolOption(SPACE_AWARE_GESTURE_ENABLED); } + AK_FORCE_INLINE float weightForLocale() const { + // The weight is in thousands and we want the real value, so we divide by 1000. + // NativeSuggestOptions#setWeightForLocale does the opposite processing in Java. 
+ return static_cast<float>(getIntOption(WEIGHT_FOR_LOCALE_IN_THOUSANDS)) / 1000.0f; + } + AK_FORCE_INLINE bool getAdditionalFeaturesBoolOption(const int key) const { return getBoolOption(key + ADDITIONAL_FEATURES_OPTIONS); } @@ -55,9 +61,10 @@ class SuggestOptions{ static const int USE_FULL_EDIT_DISTANCE = 1; static const int BLOCK_OFFENSIVE_WORDS = 2; static const int SPACE_AWARE_GESTURE_ENABLED = 3; + static const int WEIGHT_FOR_LOCALE_IN_THOUSANDS = 4; // Additional features options are stored after the other options and used as setting values of // experimental features. - static const int ADDITIONAL_FEATURES_OPTIONS = 4; + static const int ADDITIONAL_FEATURES_OPTIONS = 5; const int *const mOptions; const int mLength; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp deleted file mode 100644 index 08dc107ab..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" - -#include "suggest/core/dictionary/property/bigram_property.h" -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { - -void Ver4BigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const bigramEntryPos) const { - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(bigramEntryPos); - if (outBigramPos) { - // Lookup target PtNode position. - *outBigramPos = mTerminalPositionLookupTable->getTerminalPtNodePosition( - bigramEntry.getTargetTerminalId()); - } - if (outProbability) { - if (bigramEntry.hasHistoricalInfo()) { - *outProbability = - ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo(), - mHeaderPolicy); - } else { - *outProbability = bigramEntry.getProbability(); - } - } - if (outHasNext) { - *outHasNext = bigramEntry.hasNext(); - } -} - -bool Ver4BigramListPolicy::addNewEntry(const int terminalId, const int newTargetTerminalId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry) { - // 1. The word has no bigrams yet. - // 2. The word has bigrams, and there is the target in the list. - // 3. The word has bigrams, and there is an invalid entry that can be reclaimed. - // 4. The word has bigrams. We have to append new bigram entry to the list. - // 5. Same as 4, but the list is the last entry of the content file. 
- if (outAddedNewEntry) { - *outAddedNewEntry = false; - } - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Case 1. PtNode that doesn't have a bigram list. - // Create new bigram list. - if (!mBigramDictContent->createNewBigramList(terminalId)) { - return false; - } - const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - newTargetTerminalId); - const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom(&newBigramEntry, - bigramProperty); - // Write an entry. - int writingPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite, - &writingPos)) { - AKLOGE("Cannot write bigram entry. pos: %d.", writingPos); - return false; - } - if (!mBigramDictContent->writeTerminator(writingPos)) { - AKLOGE("Cannot write bigram list terminator. pos: %d.", writingPos); - return false; - } - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - return true; - } - - int tailEntryPos = NOT_A_DICT_POS; - const int entryPosToUpdate = getEntryPosToUpdate(newTargetTerminalId, bigramListPos, - &tailEntryPos); - if (entryPosToUpdate == NOT_A_DICT_POS) { - // Case 4, 5. Add new entry to the bigram list. - const int contentTailPos = mBigramDictContent->getContentTailPos(); - // If the tail entry is at the tail of content buffer, the new entry can be written without - // link (Case 5). - const bool canAppendEntry = - contentTailPos == tailEntryPos + mBigramDictContent->getBigramEntrySize(); - const int newEntryPos = canAppendEntry ? tailEntryPos : contentTailPos; - int writingPos = newEntryPos; - // Write new entry at the tail position of the bigram content. 
- const BigramEntry newBigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - newTargetTerminalId); - const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &newBigramEntry, bigramProperty); - if (!mBigramDictContent->writeBigramEntryAndAdvancePosition(&bigramEntryToWrite, - &writingPos)) { - AKLOGE("Cannot write bigram entry. pos: %d.", writingPos); - return false; - } - if (!mBigramDictContent->writeTerminator(writingPos)) { - AKLOGE("Cannot write bigram list terminator. pos: %d.", writingPos); - return false; - } - if (!canAppendEntry) { - // Update link of the current tail entry. - if (!mBigramDictContent->writeLink(newEntryPos, tailEntryPos)) { - AKLOGE("Cannot update bigram entry link. pos: %d, linked entry pos: %d.", - tailEntryPos, newEntryPos); - return false; - } - } - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - return true; - } - - // Case 2. Overwrite the existing entry. Case 3. Reclaim and reuse the existing invalid entry. - const BigramEntry originalBigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); - if (!originalBigramEntry.isValid()) { - // Case 3. Reuse the existing invalid entry. outAddedNewEntry is false when an existing - // entry is updated. - if (outAddedNewEntry) { - *outAddedNewEntry = true; - } - } - const BigramEntry updatedBigramEntry = - originalBigramEntry.updateTargetTerminalIdAndGetEntry(newTargetTerminalId); - const BigramEntry bigramEntryToWrite = createUpdatedBigramEntryFrom( - &updatedBigramEntry, bigramProperty); - return mBigramDictContent->writeBigramEntry(&bigramEntryToWrite, entryPosToUpdate); -} - -bool Ver4BigramListPolicy::removeEntry(const int terminalId, const int targetTerminalId) { - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Bigram list doesn't exist. 
- return false; - } - const int entryPosToUpdate = getEntryPosToUpdate(targetTerminalId, bigramListPos, - nullptr /* outTailEntryPos */); - if (entryPosToUpdate == NOT_A_DICT_POS) { - // Bigram entry doesn't exist. - return false; - } - const BigramEntry bigramEntry = mBigramDictContent->getBigramEntry(entryPosToUpdate); - if (targetTerminalId != bigramEntry.getTargetTerminalId()) { - // Bigram entry doesn't exist. - return false; - } - // Remove bigram entry by marking it as invalid entry and overwriting the original entry. - const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); - return mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPosToUpdate); -} - -bool Ver4BigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, - int *const outBigramCount) { - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Bigram list doesn't exist. - return true; - } - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - const int entryPos = readingPos - mBigramDictContent->getBigramEntrySize(); - hasNext = bigramEntry.hasNext(); - if (!bigramEntry.isValid()) { - continue; - } - const int targetPtNodePos = mTerminalPositionLookupTable->getTerminalPtNodePosition( - bigramEntry.getTargetTerminalId()); - if (targetPtNodePos == NOT_A_DICT_POS) { - // Invalidate bigram entry. 
- const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); - if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { - return false; - } - } else if (bigramEntry.hasHistoricalInfo()) { - const HistoricalInfo historicalInfo = ForgettingCurveUtils::createHistoricalInfoToSave( - bigramEntry.getHistoricalInfo(), mHeaderPolicy); - if (ForgettingCurveUtils::needsToKeep(&historicalInfo, mHeaderPolicy)) { - const BigramEntry updatedBigramEntry = - bigramEntry.updateHistoricalInfoAndGetEntry(&historicalInfo); - if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { - return false; - } - *outBigramCount += 1; - } else { - // Remove entry. - const BigramEntry updatedBigramEntry = bigramEntry.getInvalidatedEntry(); - if (!mBigramDictContent->writeBigramEntry(&updatedBigramEntry, entryPos)) { - return false; - } - } - } else { - *outBigramCount += 1; - } - } - return true; -} - -int Ver4BigramListPolicy::getBigramEntryConut(const int terminalId) { - const int bigramListPos = mBigramDictContent->getBigramListHeadPos(terminalId); - if (bigramListPos == NOT_A_DICT_POS) { - // Bigram list doesn't exist. 
- return 0; - } - int bigramCount = 0; - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - if (bigramEntry.isValid()) { - bigramCount++; - } - } - return bigramCount; -} - -int Ver4BigramListPolicy::getEntryPosToUpdate(const int targetTerminalIdToFind, - const int bigramListPos, int *const outTailEntryPos) const { - if (outTailEntryPos) { - *outTailEntryPos = NOT_A_DICT_POS; - } - int invalidEntryPos = NOT_A_DICT_POS; - int readingPos = bigramListPos; - while (true) { - const BigramEntry bigramEntry = - mBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - const int entryPos = readingPos - mBigramDictContent->getBigramEntrySize(); - if (!bigramEntry.hasNext()) { - if (outTailEntryPos) { - *outTailEntryPos = entryPos; - } - break; - } - if (bigramEntry.getTargetTerminalId() == targetTerminalIdToFind) { - // Entry with same target is found. - return entryPos; - } else if (!bigramEntry.isValid()) { - // Invalid entry that can be reused is found. - invalidEntryPos = entryPos; - } - } - return invalidEntryPos; -} - -const BigramEntry Ver4BigramListPolicy::createUpdatedBigramEntryFrom( - const BigramEntry *const originalBigramEntry, - const BigramProperty *const bigramProperty) const { - // TODO: Consolidate historical info and probability. 
- if (mHeaderPolicy->hasHistoricalInfoOfWords()) { - const HistoricalInfo historicalInfoForUpdate(bigramProperty->getTimestamp(), - bigramProperty->getLevel(), bigramProperty->getCount()); - const HistoricalInfo updatedHistoricalInfo = - ForgettingCurveUtils::createUpdatedHistoricalInfo( - originalBigramEntry->getHistoricalInfo(), bigramProperty->getProbability(), - &historicalInfoForUpdate, mHeaderPolicy); - return originalBigramEntry->updateHistoricalInfoAndGetEntry(&updatedHistoricalInfo); - } else { - return originalBigramEntry->updateProbabilityAndGetEntry(bigramProperty->getProbability()); - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h deleted file mode 100644 index 4b3bb3725..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_VER4_BIGRAM_LIST_POLICY_H -#define LATINIME_VER4_BIGRAM_LIST_POLICY_H - -#include "defines.h" -#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h" - -namespace latinime { - -class BigramDictContent; -class BigramProperty; -class HeaderPolicy; -class TerminalPositionLookupTable; - -class Ver4BigramListPolicy : public DictionaryBigramsStructurePolicy { - public: - Ver4BigramListPolicy(BigramDictContent *const bigramDictContent, - const TerminalPositionLookupTable *const terminalPositionLookupTable, - const HeaderPolicy *const headerPolicy) - : mBigramDictContent(bigramDictContent), - mTerminalPositionLookupTable(terminalPositionLookupTable), - mHeaderPolicy(headerPolicy) {} - - void getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const bigramEntryPos) const; - - bool skipAllBigrams(int *const pos) const { - // Do nothing because we don't need to skip bigram lists in ver4 dictionaries. 
- return true; - } - - bool addNewEntry(const int terminalId, const int newTargetTerminalId, - const BigramProperty *const bigramProperty, bool *const outAddedNewEntry); - - bool removeEntry(const int terminalId, const int targetTerminalId); - - bool updateAllBigramEntriesAndDeleteUselessEntries(const int terminalId, - int *const outBigramCount); - - int getBigramEntryConut(const int terminalId); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4BigramListPolicy); - - int getEntryPosToUpdate(const int targetTerminalIdToFind, const int bigramListPos, - int *const outTailEntryPos) const; - - const BigramEntry createUpdatedBigramEntryFrom(const BigramEntry *const originalBigramEntry, - const BigramProperty *const bigramProperty) const; - - BigramDictContent *const mBigramDictContent; - const TerminalPositionLookupTable *const mTerminalPositionLookupTable; - const HeaderPolicy *const mHeaderPolicy; -}; -} // namespace latinime -#endif /* LATINIME_VER4_BIGRAM_LIST_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp deleted file mode 100644 index d7e1952b5..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.cpp +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h" - -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" - -namespace latinime { - -const int BigramDictContent::INVALID_LINKED_ENTRY_POS = Ver4DictConstants::NOT_A_TERMINAL_ID; - -const BigramEntry BigramDictContent::getBigramEntryAndAdvancePosition( - int *const bigramEntryPos) const { - const BufferWithExtendableBuffer *const bigramListBuffer = getContentBuffer(); - const int bigramEntryTailPos = (*bigramEntryPos) + getBigramEntrySize(); - if (*bigramEntryPos < 0 || bigramEntryTailPos > bigramListBuffer->getTailPosition()) { - AKLOGE("Invalid bigram entry position. bigramEntryPos: %d, bigramEntryTailPos: %d, " - "bufSize: %d", *bigramEntryPos, bigramEntryTailPos, - bigramListBuffer->getTailPosition()); - ASSERT(false); - return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - Ver4DictConstants::NOT_A_TERMINAL_ID); - } - const int bigramFlags = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, bigramEntryPos); - const bool isLink = (bigramFlags & Ver4DictConstants::BIGRAM_IS_LINK_MASK) != 0; - int probability = NOT_A_PROBABILITY; - int timestamp = NOT_A_TIMESTAMP; - int level = 0; - int count = 0; - if (mHasHistoricalInfo) { - timestamp = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, bigramEntryPos); - level = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, bigramEntryPos); - count = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, bigramEntryPos); - } else { - probability = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::PROBABILITY_SIZE, bigramEntryPos); - } - const int encodedTargetTerminalId = bigramListBuffer->readUintAndAdvancePosition( - Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, bigramEntryPos); - const int targetTerminalId = - 
(encodedTargetTerminalId == Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID) ? - Ver4DictConstants::NOT_A_TERMINAL_ID : encodedTargetTerminalId; - if (isLink) { - const int linkedEntryPos = targetTerminalId; - if (linkedEntryPos == INVALID_LINKED_ENTRY_POS) { - // Bigram list terminator is found. - return BigramEntry(false /* hasNext */, NOT_A_PROBABILITY, - Ver4DictConstants::NOT_A_TERMINAL_ID); - } - *bigramEntryPos = linkedEntryPos; - return getBigramEntryAndAdvancePosition(bigramEntryPos); - } - // hasNext is always true because we should continue to read the next entry until the terminator - // is found. - if (mHasHistoricalInfo) { - const HistoricalInfo historicalInfo(timestamp, level, count); - return BigramEntry(true /* hasNext */, probability, &historicalInfo, targetTerminalId); - } else { - return BigramEntry(true /* hasNext */, probability, targetTerminalId); - } -} - -bool BigramDictContent::writeBigramEntryAndAdvancePosition( - const BigramEntry *const bigramEntryToWrite, int *const entryWritingPos) { - return writeBigramEntryAttributesAndAdvancePosition(false /* isLink */, - bigramEntryToWrite->getProbability(), bigramEntryToWrite->getTargetTerminalId(), - bigramEntryToWrite->getHistoricalInfo()->getTimeStamp(), - bigramEntryToWrite->getHistoricalInfo()->getLevel(), - bigramEntryToWrite->getHistoricalInfo()->getCount(), - entryWritingPos); -} - -bool BigramDictContent::writeBigramEntryAttributesAndAdvancePosition( - const bool isLink, const int probability, const int targetTerminalId, - const int timestamp, const int level, const int count, int *const entryWritingPos) { - BufferWithExtendableBuffer *const bigramListBuffer = getWritableContentBuffer(); - const int bigramFlags = isLink ? Ver4DictConstants::BIGRAM_IS_LINK_MASK : 0; - if (!bigramListBuffer->writeUintAndAdvancePosition(bigramFlags, - Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram flags. 
pos: %d, flags: %x", *entryWritingPos, bigramFlags); - return false; - } - if (mHasHistoricalInfo) { - if (!bigramListBuffer->writeUintAndAdvancePosition(timestamp, - Ver4DictConstants::TIME_STAMP_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram timestamps. pos: %d, timestamp: %d", *entryWritingPos, - timestamp); - return false; - } - if (!bigramListBuffer->writeUintAndAdvancePosition(level, - Ver4DictConstants::WORD_LEVEL_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram level. pos: %d, level: %d", *entryWritingPos, - level); - return false; - } - if (!bigramListBuffer->writeUintAndAdvancePosition(count, - Ver4DictConstants::WORD_COUNT_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram count. pos: %d, count: %d", *entryWritingPos, - count); - return false; - } - } else { - if (!bigramListBuffer->writeUintAndAdvancePosition(probability, - Ver4DictConstants::PROBABILITY_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram probability. pos: %d, probability: %d", *entryWritingPos, - probability); - return false; - } - } - const int targetTerminalIdToWrite = (targetTerminalId == Ver4DictConstants::NOT_A_TERMINAL_ID) ? - Ver4DictConstants::INVALID_BIGRAM_TARGET_TERMINAL_ID : targetTerminalId; - if (!bigramListBuffer->writeUintAndAdvancePosition(targetTerminalIdToWrite, - Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE, entryWritingPos)) { - AKLOGE("Cannot write bigram target terminal id. 
pos: %d, target terminal id: %d", - *entryWritingPos, targetTerminalId); - return false; - } - return true; -} - -bool BigramDictContent::writeLink(const int linkedEntryPos, const int writingPos) { - const int targetTerminalId = linkedEntryPos; - int pos = writingPos; - return writeBigramEntryAttributesAndAdvancePosition(true /* isLink */, - NOT_A_PROBABILITY /* probability */, targetTerminalId, NOT_A_TIMESTAMP, 0 /* level */, - 0 /* count */, &pos); -} - -bool BigramDictContent::runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const BigramDictContent *const originalBigramDictContent, - int *const outBigramEntryCount) { - for (TerminalPositionLookupTable::TerminalIdMap::const_iterator it = terminalIdMap->begin(); - it != terminalIdMap->end(); ++it) { - const int originalBigramListPos = - originalBigramDictContent->getBigramListHeadPos(it->first); - if (originalBigramListPos == NOT_A_DICT_POS) { - // This terminal does not have a bigram list. - continue; - } - const int bigramListPos = getContentBuffer()->getTailPosition(); - int bigramEntryCount = 0; - // Copy bigram list with GC from original content. - if (!runGCBigramList(originalBigramListPos, originalBigramDictContent, bigramListPos, - terminalIdMap, &bigramEntryCount)) { - AKLOGE("Cannot complete GC for the bigram list. original pos: %d, pos: %d", - originalBigramListPos, bigramListPos); - return false; - } - if (bigramEntryCount == 0) { - // All bigram entries are useless. This terminal does not have a bigram list. - continue; - } - *outBigramEntryCount += bigramEntryCount; - // Set bigram list position to the lookup table. - if (!getUpdatableAddressLookupTable()->set(it->second, bigramListPos)) { - AKLOGE("Cannot set bigram list position. terminal id: %d, pos: %d", - it->second, bigramListPos); - return false; - } - } - return true; -} - -// Returns whether GC for the bigram list was succeeded or not. 
-bool BigramDictContent::runGCBigramList(const int bigramListPos, - const BigramDictContent *const sourceBigramDictContent, const int toPos, - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - int *const outEntryCount) { - bool hasNext = true; - int readingPos = bigramListPos; - int writingPos = toPos; - while (hasNext) { - const BigramEntry originalBigramEntry = - sourceBigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = originalBigramEntry.hasNext(); - if (!originalBigramEntry.isValid()) { - continue; - } - TerminalPositionLookupTable::TerminalIdMap::const_iterator it = - terminalIdMap->find(originalBigramEntry.getTargetTerminalId()); - if (it == terminalIdMap->end()) { - // Target word has been removed. - continue; - } - const BigramEntry updatedBigramEntry = - originalBigramEntry.updateTargetTerminalIdAndGetEntry(it->second); - if (!writeBigramEntryAndAdvancePosition(&updatedBigramEntry, &writingPos)) { - AKLOGE("Cannot write bigram entry to run GC. pos: %d", writingPos); - return false; - } - *outEntryCount += 1; - } - if (*outEntryCount > 0) { - if (!writeTerminator(writingPos)) { - AKLOGE("Cannot write terminator to run GC. pos: %d", writingPos); - return false; - } - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h deleted file mode 100644 index 361dd2c74..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_dict_content.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BIGRAM_DICT_CONTENT_H -#define LATINIME_BIGRAM_DICT_CONTENT_H - -#include <cstdint> -#include <cstdio> - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/sparse_table_dict_content.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" - -namespace latinime { - -class BigramDictContent : public SparseTableDictContent { - public: - BigramDictContent(uint8_t *const *buffers, const int *bufferSizes, const bool hasHistoricalInfo) - : SparseTableDictContent(buffers, bufferSizes, - Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), - mHasHistoricalInfo(hasHistoricalInfo) {} - - BigramDictContent(const bool hasHistoricalInfo) - : SparseTableDictContent(Ver4DictConstants::BIGRAM_ADDRESS_TABLE_BLOCK_SIZE, - Ver4DictConstants::BIGRAM_ADDRESS_TABLE_DATA_SIZE), - mHasHistoricalInfo(hasHistoricalInfo) {} - - int getContentTailPos() const { - return getContentBuffer()->getTailPosition(); - } - - const BigramEntry getBigramEntry(const int bigramEntryPos) const { - int readingPos = bigramEntryPos; - return getBigramEntryAndAdvancePosition(&readingPos); - } - - const BigramEntry getBigramEntryAndAdvancePosition(int *const bigramEntryPos) const; - - // Returns head position of bigram list for a PtNode specified by terminalId. 
- int getBigramListHeadPos(const int terminalId) const { - const SparseTable *const addressLookupTable = getAddressLookupTable(); - if (!addressLookupTable->contains(terminalId)) { - return NOT_A_DICT_POS; - } - return addressLookupTable->get(terminalId); - } - - bool writeBigramEntryAtTail(const BigramEntry *const bigramEntryToWrite) { - int writingPos = getContentBuffer()->getTailPosition(); - return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); - } - - bool writeBigramEntry(const BigramEntry *const bigramEntryToWrite, const int entryWritingPos) { - int writingPos = entryWritingPos; - return writeBigramEntryAndAdvancePosition(bigramEntryToWrite, &writingPos); - } - - bool writeBigramEntryAndAdvancePosition(const BigramEntry *const bigramEntryToWrite, - int *const entryWritingPos); - - bool writeTerminator(const int writingPos) { - // Terminator is a link to the invalid position. - return writeLink(INVALID_LINKED_ENTRY_POS, writingPos); - } - - bool writeLink(const int linkedPos, const int writingPos); - - bool createNewBigramList(const int terminalId) { - const int bigramListPos = getContentBuffer()->getTailPosition(); - return getUpdatableAddressLookupTable()->set(terminalId, bigramListPos); - } - - bool flushToFile(FILE *const file) const { - return flush(file); - } - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const BigramDictContent *const originalBigramDictContent, - int *const outBigramEntryCount); - - int getBigramEntrySize() const { - if (mHasHistoricalInfo) { - return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE - + Ver4DictConstants::TIME_STAMP_FIELD_SIZE - + Ver4DictConstants::WORD_LEVEL_FIELD_SIZE - + Ver4DictConstants::WORD_COUNT_FIELD_SIZE - + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; - } else { - return Ver4DictConstants::BIGRAM_FLAGS_FIELD_SIZE - + Ver4DictConstants::PROBABILITY_SIZE - + Ver4DictConstants::BIGRAM_TARGET_TERMINAL_ID_FIELD_SIZE; - } - } - - private: - 
DISALLOW_COPY_AND_ASSIGN(BigramDictContent); - - static const int INVALID_LINKED_ENTRY_POS; - - bool writeBigramEntryAttributesAndAdvancePosition( - const bool isLink, const int probability, const int targetTerminalId, - const int timestamp, const int level, const int count, int *const entryWritingPos); - - bool runGCBigramList(const int bigramListPos, - const BigramDictContent *const sourceBigramDictContent, const int toPos, - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - int *const outEntryCount); - - bool mHasHistoricalInfo; -}; -} // namespace latinime -#endif /* LATINIME_BIGRAM_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h deleted file mode 100644 index 2b0cbd93b..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/bigram_entry.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_BIGRAM_ENTRY_H -#define LATINIME_BIGRAM_ENTRY_H - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/historical_info.h" - -namespace latinime { - -class BigramEntry { - public: - BigramEntry(const BigramEntry& bigramEntry) - : mHasNext(bigramEntry.mHasNext), mProbability(bigramEntry.mProbability), - mHistoricalInfo(), mTargetTerminalId(bigramEntry.mTargetTerminalId) {} - - // Entry with historical information. - BigramEntry(const bool hasNext, const int probability, const int targetTerminalId) - : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(), - mTargetTerminalId(targetTerminalId) {} - - // Entry with historical information. - BigramEntry(const bool hasNext, const int probability, - const HistoricalInfo *const historicalInfo, const int targetTerminalId) - : mHasNext(hasNext), mProbability(probability), mHistoricalInfo(*historicalInfo), - mTargetTerminalId(targetTerminalId) {} - - const BigramEntry getInvalidatedEntry() const { - return updateTargetTerminalIdAndGetEntry(Ver4DictConstants::NOT_A_TERMINAL_ID); - } - - const BigramEntry updateHasNextAndGetEntry(const bool hasNext) const { - return BigramEntry(hasNext, mProbability, &mHistoricalInfo, mTargetTerminalId); - } - - const BigramEntry updateTargetTerminalIdAndGetEntry(const int newTargetTerminalId) const { - return BigramEntry(mHasNext, mProbability, &mHistoricalInfo, newTargetTerminalId); - } - - const BigramEntry updateProbabilityAndGetEntry(const int probability) const { - return BigramEntry(mHasNext, probability, &mHistoricalInfo, mTargetTerminalId); - } - - const BigramEntry updateHistoricalInfoAndGetEntry( - const HistoricalInfo *const historicalInfo) const { - return BigramEntry(mHasNext, mProbability, historicalInfo, mTargetTerminalId); - } - - bool isValid() const { - return mTargetTerminalId != Ver4DictConstants::NOT_A_TERMINAL_ID; - } - - bool hasNext() const { - 
return mHasNext; - } - - int getProbability() const { - return mProbability; - } - - bool hasHistoricalInfo() const { - return mHistoricalInfo.isValid(); - } - - const HistoricalInfo *getHistoricalInfo() const { - return &mHistoricalInfo; - } - - int getTargetTerminalId() const { - return mTargetTerminalId; - } - - private: - // Copy constructor is public to use this class as a type of return value. - DISALLOW_DEFAULT_CONSTRUCTOR(BigramEntry); - DISALLOW_ASSIGNMENT_OPERATOR(BigramEntry); - - const bool mHasNext; - const int mProbability; - const HistoricalInfo mHistoricalInfo; - const int mTargetTerminalId; -}; -} // namespace latinime -#endif /* LATINIME_BIGRAM_ENTRY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp deleted file mode 100644 index 5dc91ba10..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h" - -namespace latinime { - -bool LanguageModelDictContent::save(FILE *const file) const { - return mTrieMap.save(file); -} - -bool LanguageModelDictContent::runGC( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const LanguageModelDictContent *const originalContent, - int *const outNgramCount) { - return runGCInner(terminalIdMap, originalContent->mTrieMap.getEntriesInRootLevel(), - 0 /* nextLevelBitmapEntryIndex */, outNgramCount); -} - -ProbabilityEntry LanguageModelDictContent::getNgramProbabilityEntry( - const WordIdArrayView prevWordIds, const int wordId) const { - const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); - if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { - return ProbabilityEntry(); - } - const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); - if (!result.mIsValid) { - // Not found. - return ProbabilityEntry(); - } - return ProbabilityEntry::decode(result.mValue, mHasHistoricalInfo); -} - -bool LanguageModelDictContent::setNgramProbabilityEntry(const WordIdArrayView prevWordIds, - const int terminalId, const ProbabilityEntry *const probabilityEntry) { - const int bitmapEntryIndex = getBitmapEntryIndex(prevWordIds); - if (bitmapEntryIndex == TrieMap::INVALID_INDEX) { - return false; - } - return mTrieMap.put(terminalId, probabilityEntry->encode(mHasHistoricalInfo), bitmapEntryIndex); -} - -bool LanguageModelDictContent::runGCInner( - const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const TrieMap::TrieMapRange trieMapRange, - const int nextLevelBitmapEntryIndex, int *const outNgramCount) { - for (auto &entry : trieMapRange) { - const auto it = terminalIdMap->find(entry.key()); - if (it == terminalIdMap->end() || it->second == Ver4DictConstants::NOT_A_TERMINAL_ID) { - // The word has been removed. 
- continue; - } - if (!mTrieMap.put(it->second, entry.value(), nextLevelBitmapEntryIndex)) { - return false; - } - if (outNgramCount) { - *outNgramCount += 1; - } - if (entry.hasNextLevelMap()) { - if (!runGCInner(terminalIdMap, entry.getEntriesInNextLevel(), - mTrieMap.getNextLevelBitmapEntryIndex(it->second, nextLevelBitmapEntryIndex), - outNgramCount)) { - return false; - } - } - } - return true; -} - -int LanguageModelDictContent::getBitmapEntryIndex(const WordIdArrayView prevWordIds) const { - int bitmapEntryIndex = mTrieMap.getRootBitmapEntryIndex(); - for (const int wordId : prevWordIds) { - const TrieMap::Result result = mTrieMap.get(wordId, bitmapEntryIndex); - if (!result.mIsValid) { - return TrieMap::INVALID_INDEX; - } - bitmapEntryIndex = result.mNextLevelBitmapEntryIndex; - } - return bitmapEntryIndex; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h deleted file mode 100644 index 18f2e0170..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (C) 2014, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H -#define LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H - -#include <cstdio> - -#include "defines.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/probability_entry.h" -#include "suggest/policyimpl/dictionary/structure/v4/content/terminal_position_lookup_table.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/utils/trie_map.h" -#include "utils/byte_array_view.h" -#include "utils/int_array_view.h" - -namespace latinime { - -/** - * Class representing language model. - * - * This class provides methods to get and store unigram/n-gram probability information and flags. - */ -class LanguageModelDictContent { - public: - LanguageModelDictContent(const ReadWriteByteArrayView trieMapBuffer, - const bool hasHistoricalInfo) - : mTrieMap(trieMapBuffer), mHasHistoricalInfo(hasHistoricalInfo) {} - - explicit LanguageModelDictContent(const bool hasHistoricalInfo) - : mTrieMap(), mHasHistoricalInfo(hasHistoricalInfo) {} - - bool isNearSizeLimit() const { - return mTrieMap.isNearSizeLimit(); - } - - bool save(FILE *const file) const; - - bool runGC(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const LanguageModelDictContent *const originalContent, - int *const outNgramCount); - - ProbabilityEntry getProbabilityEntry(const int wordId) const { - return getNgramProbabilityEntry(WordIdArrayView(), wordId); - } - - bool setProbabilityEntry(const int wordId, const ProbabilityEntry *const probabilityEntry) { - return setNgramProbabilityEntry(WordIdArrayView(), wordId, probabilityEntry); - } - - ProbabilityEntry getNgramProbabilityEntry(const WordIdArrayView prevWordIds, - const int wordId) const; - - bool setNgramProbabilityEntry(const WordIdArrayView prevWordIds, const int wordId, - const ProbabilityEntry *const probabilityEntry); - - private: - DISALLOW_COPY_AND_ASSIGN(LanguageModelDictContent); - - TrieMap mTrieMap; - const 
bool mHasHistoricalInfo; - - bool runGCInner(const TerminalPositionLookupTable::TerminalIdMap *const terminalIdMap, - const TrieMap::TrieMapRange trieMapRange, const int nextLevelBitmapEntryIndex, - int *const outNgramCount); - - int getBitmapEntryIndex(const WordIdArrayView prevWordIds) const; -}; -} // namespace latinime -#endif /* LATINIME_LANGUAGE_MODEL_DICT_CONTENT_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp deleted file mode 100644 index 723808399..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ /dev/null @@ -1,551 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h" - -#include <vector> - -#include "suggest/core/dicnode/dic_node.h" -#include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/ngram_listener.h" -#include "suggest/core/dictionary/property/bigram_property.h" -#include "suggest/core/dictionary/property/unigram_property.h" -#include "suggest/core/dictionary/property/word_property.h" -#include "suggest/core/session/prev_words_info.h" -#include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" -#include "suggest/policyimpl/dictionary/utils/probability_utils.h" - -namespace latinime { - -// Note that there are corresponding definitions in Java side in BinaryDictionaryTests and -// BinaryDictionaryDecayingTests. -const char *const Ver4PatriciaTriePolicy::UNIGRAM_COUNT_QUERY = "UNIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::BIGRAM_COUNT_QUERY = "BIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::MAX_UNIGRAM_COUNT_QUERY = "MAX_UNIGRAM_COUNT"; -const char *const Ver4PatriciaTriePolicy::MAX_BIGRAM_COUNT_QUERY = "MAX_BIGRAM_COUNT"; -const int Ver4PatriciaTriePolicy::MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS = 1024; -const int Ver4PatriciaTriePolicy::MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS = - Ver4DictConstants::MAX_DICTIONARY_SIZE - MARGIN_TO_REFUSE_DYNAMIC_OPERATIONS; - -void Ver4PatriciaTriePolicy::createAndGetAllChildDicNodes(const DicNode *const dicNode, - DicNodeVector *const childDicNodes) const { - if (!dicNode->hasChildren()) { - return; - } - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPtNodeArrayPos()); - while (!readingHelper.isEnd()) { - const PtNodeParams ptNodeParams = readingHelper.getPtNodeParams(); - if 
(!ptNodeParams.isValid()) { - break; - } - bool isTerminal = ptNodeParams.isTerminal() && !ptNodeParams.isDeleted(); - if (isTerminal && mHeaderPolicy->isDecayingDict()) { - // A DecayingDict may have a terminal PtNode that has a terminal DicNode whose - // probability is NOT_A_PROBABILITY. In such case, we don't want to treat it as a - // valid terminal DicNode. - isTerminal = ptNodeParams.getProbability() != NOT_A_PROBABILITY; - } - readingHelper.readNextSiblingNode(ptNodeParams); - if (ptNodeParams.representsNonWordInfo()) { - // Skip PtNodes that represent non-word information. - continue; - } - childDicNodes->pushLeavingChild(dicNode, ptNodeParams.getHeadPos(), - ptNodeParams.getChildrenPos(), ptNodeParams.getProbability(), isTerminal, - ptNodeParams.hasChildren(), - ptNodeParams.isBlacklisted() - || ptNodeParams.isNotAWord() /* isBlacklistedOrNotAWord */, - ptNodeParams.getCodePointCount(), ptNodeParams.getCodePoints()); - } - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); - } -} - -int Ver4PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const { - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodePos(ptNodePos); - const int codePointCount = readingHelper.getCodePointsAndProbabilityAndReturnCodePointCount( - maxCodePointCount, outCodePoints, outUnigramProbability); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in getCodePointsAndProbabilityAndReturnCodePointCount()."); - } - return codePointCount; -} - -int Ver4PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, - const int length, const bool forceLowerCaseSearch) const { - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - 
readingHelper.initWithPtNodeArrayPos(getRootPosition()); - const int ptNodePos = - readingHelper.getTerminalPtNodePositionOfWord(inWord, length, forceLowerCaseSearch); - if (readingHelper.isError()) { - mIsCorrupted = true; - AKLOGE("Dictionary reading error in createAndGetAllChildDicNodes()."); - } - return ptNodePos; -} - -int Ver4PatriciaTriePolicy::getProbability(const int unigramProbability, - const int bigramProbability) const { - if (mHeaderPolicy->isDecayingDict()) { - // Both probabilities are encoded. Decode them and get probability. - return ForgettingCurveUtils::getProbability(unigramProbability, bigramProbability); - } else { - if (unigramProbability == NOT_A_PROBABILITY) { - return NOT_A_PROBABILITY; - } else if (bigramProbability == NOT_A_PROBABILITY) { - return ProbabilityUtils::backoff(unigramProbability); - } else { - return bigramProbability; - } - } -} - -int Ver4PatriciaTriePolicy::getProbabilityOfPtNode(const int *const prevWordsPtNodePos, - const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_PROBABILITY; - } - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted() || ptNodeParams.isBlacklisted() || ptNodeParams.isNotAWord()) { - return NOT_A_PROBABILITY; - } - if (prevWordsPtNodePos) { - const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == ptNodePos - && bigramsIt.getProbability() != NOT_A_PROBABILITY) { - return getProbability(ptNodeParams.getProbability(), bigramsIt.getProbability()); - } - } - return NOT_A_PROBABILITY; - } - return getProbability(ptNodeParams.getProbability(), NOT_A_PROBABILITY); -} - -void Ver4PatriciaTriePolicy::iterateNgramEntries(const int *const prevWordsPtNodePos, - NgramListener *const listener) const { - if (!prevWordsPtNodePos) 
{ - return; - } - const int bigramsPosition = getBigramsPositionOfPtNode(prevWordsPtNodePos[0]); - BinaryDictionaryBigramsIterator bigramsIt(&mBigramPolicy, bigramsPosition); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - listener->onVisitEntry(bigramsIt.getProbability(), bigramsIt.getBigramPos()); - } -} - -int Ver4PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted()) { - return NOT_A_DICT_POS; - } - return mBuffers->getShortcutDictContent()->getShortcutListHeadPos( - ptNodeParams.getTerminalId()); -} - -int Ver4PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { - if (ptNodePos == NOT_A_DICT_POS) { - return NOT_A_DICT_POS; - } - const PtNodeParams ptNodeParams(mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos)); - if (ptNodeParams.isDeleted()) { - return NOT_A_DICT_POS; - } - return mBuffers->getBigramDictContent()->getBigramListHeadPos( - ptNodeParams.getTerminalId()); -} - -bool Ver4PatriciaTriePolicy::addUnigramEntry(const int *const word, const int length, - const UnigramProperty *const unigramProperty) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addUnigramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. 
Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (length > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert to the dictionary, length: %d", length); - return false; - } - for (const auto &shortcut : unigramProperty->getShortcuts()) { - if (shortcut.getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("One of shortcut targets is too long to insert to the dictionary, length: %d", - shortcut.getTargetCodePoints()->size()); - return false; - } - } - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - bool addedNewUnigram = false; - int codePointsToAdd[MAX_WORD_LENGTH]; - int codePointCountToAdd = length; - memmove(codePointsToAdd, word, sizeof(int) * length); - if (unigramProperty->representsBeginningOfSentence()) { - codePointCountToAdd = CharUtils::attachBeginningOfSentenceMarker(codePointsToAdd, - codePointCountToAdd, MAX_WORD_LENGTH); - } - if (codePointCountToAdd <= 0) { - return false; - } - if (mUpdatingHelper.addUnigramWord(&readingHelper, codePointsToAdd, codePointCountToAdd, - unigramProperty, &addedNewUnigram)) { - if (addedNewUnigram && !unigramProperty->representsBeginningOfSentence()) { - mUnigramCount++; - } - if (unigramProperty->getShortcuts().size() > 0) { - // Add shortcut target. - const int wordPos = getTerminalPtNodePositionOfWord(word, length, - false /* forceLowerCaseSearch */); - if (wordPos == NOT_A_DICT_POS) { - AKLOGE("Cannot find terminal PtNode position to add shortcut target."); - return false; - } - for (const auto &shortcut : unigramProperty->getShortcuts()) { - if (!mUpdatingHelper.addShortcutTarget(wordPos, - shortcut.getTargetCodePoints()->data(), - shortcut.getTargetCodePoints()->size(), shortcut.getProbability())) { - AKLOGE("Cannot add new shortcut target. 
PtNodePos: %d, length: %d, " - "probability: %d", wordPos, shortcut.getTargetCodePoints()->size(), - shortcut.getProbability()); - return false; - } - } - } - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::removeUnigramEntry(const int *const word, const int length) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: removeUnigramEntry() is called for non-updatable dictionary."); - return false; - } - const int ptNodePos = getTerminalPtNodePositionOfWord(word, length, - false /* forceLowerCaseSearch */); - if (ptNodePos == NOT_A_DICT_POS) { - return false; - } - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (!mNodeWriter.markPtNodeAsDeleted(&ptNodeParams)) { - AKLOGE("Cannot remove unigram. ptNodePos: %d", ptNodePos); - return false; - } - if (!ptNodeParams.representsNonWordInfo()) { - mUnigramCount--; - } - return true; -} - -bool Ver4PatriciaTriePolicy::addNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const BigramProperty *const bigramProperty) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: addNgramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (!prevWordsInfo->isValid()) { - AKLOGE("prev words info is not valid for adding n-gram entry to the dictionary."); - return false; - } - if (bigramProperty->getTargetCodePoints()->size() > MAX_WORD_LENGTH) { - AKLOGE("The word is too long to insert the ngram to the dictionary. 
" - "length: %d", bigramProperty->getTargetCodePoints()->size()); - return false; - } - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, - false /* tryLowerCaseSearch */); - const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos); - // TODO: Support N-gram. - if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { - if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */)) { - const std::vector<UnigramProperty::ShortcutProperty> shortcuts; - const UnigramProperty beginningOfSentenceUnigramProperty( - true /* representsBeginningOfSentence */, true /* isNotAWord */, - false /* isBlacklisted */, MAX_PROBABILITY /* probability */, - NOT_A_TIMESTAMP /* timestamp */, 0 /* level */, 0 /* count */, &shortcuts); - if (!addUnigramEntry(prevWordsInfo->getNthPrevWordCodePoints(1 /* n */), - prevWordsInfo->getNthPrevWordCodePointCount(1 /* n */), - &beginningOfSentenceUnigramProperty)) { - AKLOGE("Cannot add unigram entry for the beginning-of-sentence."); - return false; - } - // Refresh Terminal PtNode positions. 
- prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, - false /* tryLowerCaseSearch */); - } else { - return false; - } - } - const int word1Pos = getTerminalPtNodePositionOfWord( - bigramProperty->getTargetCodePoints()->data(), - bigramProperty->getTargetCodePoints()->size(), false /* forceLowerCaseSearch */); - if (word1Pos == NOT_A_DICT_POS) { - return false; - } - bool addedNewEntry = false; - if (mUpdatingHelper.addNgramEntry(prevWordsPtNodePosView, word1Pos, bigramProperty, - &addedNewEntry)) { - if (addedNewEntry) { - mBigramCount++; - } - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::removeNgramEntry(const PrevWordsInfo *const prevWordsInfo, - const int *const word, const int length) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: removeNgramEntry() is called for non-updatable dictionary."); - return false; - } - if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS) { - AKLOGE("The dictionary is too large to dynamically update. Dictionary size: %d", - mDictBuffer->getTailPosition()); - return false; - } - if (!prevWordsInfo->isValid()) { - AKLOGE("prev words info is not valid for removing n-gram entry form the dictionary."); - return false; - } - if (length > MAX_WORD_LENGTH) { - AKLOGE("word is too long to remove n-gram entry form the dictionary. length: %d", length); - } - int prevWordsPtNodePos[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - prevWordsInfo->getPrevWordsTerminalPtNodePos(this, prevWordsPtNodePos, - false /* tryLowerCaseSerch */); - const auto prevWordsPtNodePosView = PtNodePosArrayView::fromFixedSizeArray(prevWordsPtNodePos); - // TODO: Support N-gram. 
- if (prevWordsPtNodePos[0] == NOT_A_DICT_POS) { - return false; - } - const int wordPos = getTerminalPtNodePositionOfWord(word, length, - false /* forceLowerCaseSearch */); - if (wordPos == NOT_A_DICT_POS) { - return false; - } - if (mUpdatingHelper.removeNgramEntry(prevWordsPtNodePosView, wordPos)) { - mBigramCount--; - return true; - } else { - return false; - } -} - -bool Ver4PatriciaTriePolicy::flush(const char *const filePath) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: flush() is called for non-updatable dictionary. filePath: %s", filePath); - return false; - } - if (!mWritingHelper.writeToDictFile(filePath, mUnigramCount, mBigramCount)) { - AKLOGE("Cannot flush the dictionary to file."); - mIsCorrupted = true; - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::flushWithGC(const char *const filePath) { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); - return false; - } - if (!mWritingHelper.writeToDictFileWithGC(getRootPosition(), filePath)) { - AKLOGE("Cannot flush the dictionary to file with GC."); - mIsCorrupted = true; - return false; - } - return true; -} - -bool Ver4PatriciaTriePolicy::needsToRunGC(const bool mindsBlockByGC) const { - if (!mBuffers->isUpdatable()) { - AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); - return false; - } - if (mBuffers->isNearSizeLimit()) { - // Additional buffer size is near the limit. - return true; - } else if (mHeaderPolicy->getExtendedRegionSize() + mDictBuffer->getUsedAdditionalBufferSize() - > Ver4DictConstants::MAX_DICT_EXTENDED_REGION_SIZE) { - // Total extended region size of the trie exceeds the limit. - return true; - } else if (mDictBuffer->getTailPosition() >= MIN_DICT_SIZE_TO_REFUSE_DYNAMIC_OPERATIONS - && mDictBuffer->getUsedAdditionalBufferSize() > 0) { - // Needs to reduce dictionary size. 
- return true; - } else if (mHeaderPolicy->isDecayingDict()) { - return ForgettingCurveUtils::needsToDecay(mindsBlockByGC, mUnigramCount, mBigramCount, - mHeaderPolicy); - } - return false; -} - -void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int queryLength, - char *const outResult, const int maxResultLength) { - const int compareLength = queryLength + 1 /* terminator */; - if (strncmp(query, UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mUnigramCount); - } else if (strncmp(query, BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", mBigramCount); - } else if (strncmp(query, MAX_UNIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? - ForgettingCurveUtils::getUnigramCountHardLimit( - mHeaderPolicy->getMaxUnigramCount()) : - static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } else if (strncmp(query, MAX_BIGRAM_COUNT_QUERY, compareLength) == 0) { - snprintf(outResult, maxResultLength, "%d", - mHeaderPolicy->isDecayingDict() ? 
- ForgettingCurveUtils::getBigramCountHardLimit( - mHeaderPolicy->getMaxBigramCount()) : - static_cast<int>(Ver4DictConstants::MAX_DICTIONARY_SIZE)); - } -} - -const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, - const int codePointCount) const { - const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, - false /* forceLowerCaseSearch */); - if (ptNodePos == NOT_A_DICT_POS) { - AKLOGE("getWordProperty is called for invalid word."); - return WordProperty(); - } - const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - std::vector<int> codePointVector(ptNodeParams.getCodePoints(), - ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); - const ProbabilityEntry probabilityEntry = - mBuffers->getLanguageModelDictContent()->getProbabilityEntry( - ptNodeParams.getTerminalId()); - const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); - // Fetch bigram information. 
- std::vector<BigramProperty> bigrams; - const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); - if (bigramListPos != NOT_A_DICT_POS) { - int bigramWord1CodePoints[MAX_WORD_LENGTH]; - const BigramDictContent *const bigramDictContent = mBuffers->getBigramDictContent(); - const TerminalPositionLookupTable *const terminalPositionLookupTable = - mBuffers->getTerminalPositionLookupTable(); - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const BigramEntry bigramEntry = - bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - hasNext = bigramEntry.hasNext(); - const int word1TerminalId = bigramEntry.getTargetTerminalId(); - const int word1TerminalPtNodePos = - terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); - if (word1TerminalPtNodePos == NOT_A_DICT_POS) { - continue; - } - // Word (unigram) probability - int word1Probability = NOT_A_PROBABILITY; - const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( - word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, - &word1Probability); - const std::vector<int> word1(bigramWord1CodePoints, - bigramWord1CodePoints + codePointCount); - const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); - const int probability = bigramEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability( - bigramEntry.getHistoricalInfo(), mHeaderPolicy) : - bigramEntry.getProbability(); - bigrams.emplace_back(&word1, probability, - historicalInfo->getTimeStamp(), historicalInfo->getLevel(), - historicalInfo->getCount()); - } - } - // Fetch shortcut information. 
- std::vector<UnigramProperty::ShortcutProperty> shortcuts; - int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); - if (shortcutPos != NOT_A_DICT_POS) { - int shortcutTarget[MAX_WORD_LENGTH]; - const ShortcutDictContent *const shortcutDictContent = - mBuffers->getShortcutDictContent(); - bool hasNext = true; - while (hasNext) { - int shortcutTargetLength = 0; - int shortcutProbability = NOT_A_PROBABILITY; - shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, - &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); - const std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); - shortcuts.emplace_back(&target, shortcutProbability); - } - } - const UnigramProperty unigramProperty(ptNodeParams.representsBeginningOfSentence(), - ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.getProbability(), - historicalInfo->getTimeStamp(), historicalInfo->getLevel(), - historicalInfo->getCount(), &shortcuts); - return WordProperty(&codePointVector, &unigramProperty, &bigrams); -} - -int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, int *const outCodePoints, - int *const outCodePointCount) { - *outCodePointCount = 0; - if (token == 0) { - mTerminalPtNodePositionsForIteratingWords.clear(); - DynamicPtReadingHelper::TraversePolicyToGetAllTerminalPtNodePositions traversePolicy( - &mTerminalPtNodePositionsForIteratingWords); - DynamicPtReadingHelper readingHelper(&mNodeReader, &mPtNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(getRootPosition()); - readingHelper.traverseAllPtNodesInPostorderDepthFirstManner(&traversePolicy); - } - const int terminalPtNodePositionsVectorSize = - static_cast<int>(mTerminalPtNodePositionsForIteratingWords.size()); - if (token < 0 || token >= terminalPtNodePositionsVectorSize) { - AKLOGE("Given token %d is invalid.", token); - return 0; - } - const int terminalPtNodePos = mTerminalPtNodePositionsForIteratingWords[token]; - 
int unigramProbability = NOT_A_PROBABILITY; - *outCodePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( - terminalPtNodePos, MAX_WORD_LENGTH, outCodePoints, &unigramProbability); - const int nextToken = token + 1; - if (nextToken >= terminalPtNodePositionsVectorSize) { - // All words have been iterated. - mTerminalPtNodePositionsForIteratingWords.clear(); - return 0; - } - return nextToken; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp deleted file mode 100644 index 4220312e0..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.h" - -#include <cstring> -#include <queue> - -#include "suggest/policyimpl/dictionary/header/header_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/bigram/ver4_bigram_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/shortcut/ver4_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_buffers.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_dict_constants.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_writer.h" -#include "suggest/policyimpl/dictionary/structure/v4/ver4_pt_node_array_reader.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" -#include "suggest/policyimpl/dictionary/utils/file_utils.h" -#include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" - -namespace latinime { - -bool Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPath, - const int unigramCount, const int bigramCount) const { - const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); - BufferWithExtendableBuffer headerBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - const int extendedRegionSize = headerPolicy->getExtendedRegionSize() - + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, - unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { - AKLOGE("Cannot write header structure to buffer. 
" - "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " - "extendedRegionSize: %d", false, unigramCount, bigramCount, - extendedRegionSize); - return false; - } - return mBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); -} - -bool Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, - const char *const dictDirPath) { - const HeaderPolicy *const headerPolicy = mBuffers->getHeaderPolicy(); - Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers( - Ver4DictBuffers::createVer4DictBuffers(headerPolicy, - Ver4DictConstants::MAX_DICTIONARY_SIZE)); - int unigramCount = 0; - int bigramCount = 0; - if (!runGC(rootPtNodeArrayPos, headerPolicy, dictBuffers.get(), &unigramCount, &bigramCount)) { - return false; - } - BufferWithExtendableBuffer headerBuffer( - BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, - unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { - return false; - } - return dictBuffers->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); -} - -bool Ver4PatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, - const HeaderPolicy *const headerPolicy, Ver4DictBuffers *const buffersToWrite, - int *const outUnigramCount, int *const outBigramCount) { - Ver4PatriciaTrieNodeReader ptNodeReader(mBuffers->getTrieBuffer(), - mBuffers->getLanguageModelDictContent(), headerPolicy); - Ver4PtNodeArrayReader ptNodeArrayReader(mBuffers->getTrieBuffer()); - Ver4BigramListPolicy bigramPolicy(mBuffers->getMutableBigramDictContent(), - mBuffers->getTerminalPositionLookupTable(), headerPolicy); - Ver4ShortcutListPolicy shortcutPolicy(mBuffers->getMutableShortcutDictContent(), - mBuffers->getTerminalPositionLookupTable()); - Ver4PatriciaTrieNodeWriter ptNodeWriter(mBuffers->getWritableTrieBuffer(), - mBuffers, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, - &shortcutPolicy); - - 
DynamicPtReadingHelper readingHelper(&ptNodeReader, &ptNodeArrayReader); - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners - ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( - &ptNodeWriter); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { - return false; - } - const int unigramCount = traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted - .getValidUnigramCount(); - const int maxUnigramCount = headerPolicy->getMaxUnigramCount(); - if (headerPolicy->isDecayingDict() && unigramCount > maxUnigramCount) { - if (!truncateUnigrams(&ptNodeReader, &ptNodeWriter, maxUnigramCount)) { - AKLOGE("Cannot remove unigrams. current: %d, max: %d", unigramCount, - maxUnigramCount); - return false; - } - } - - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners::TraversePolicyToUpdateBigramProbability - traversePolicyToUpdateBigramProbability(&ptNodeWriter); - if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateBigramProbability)) { - return false; - } - const int bigramCount = traversePolicyToUpdateBigramProbability.getValidBigramEntryCount(); - const int maxBigramCount = headerPolicy->getMaxBigramCount(); - if (headerPolicy->isDecayingDict() && bigramCount > maxBigramCount) { - if (!truncateBigrams(maxBigramCount)) { - AKLOGE("Cannot remove bigrams. current: %d, max: %d", bigramCount, maxBigramCount); - return false; - } - } - - // Mapping from positions in mBuffer to positions in bufferToWrite. 
- PtNodeWriter::DictPositionRelocationMap dictPositionRelocationMap; - readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - Ver4PatriciaTrieNodeWriter ptNodeWriterForNewBuffers(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, headerPolicy, &ptNodeReader, &ptNodeArrayReader, &bigramPolicy, - &shortcutPolicy); - DynamicPtGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer - traversePolicyToPlaceAndWriteValidPtNodesToBuffer(&ptNodeWriterForNewBuffers, - buffersToWrite->getWritableTrieBuffer(), &dictPositionRelocationMap); - if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { - return false; - } - - // Create policy instances for the GCed dictionary. - Ver4PatriciaTrieNodeReader newPtNodeReader(buffersToWrite->getTrieBuffer(), - buffersToWrite->getLanguageModelDictContent(), headerPolicy); - Ver4PtNodeArrayReader newPtNodeArrayreader(buffersToWrite->getTrieBuffer()); - Ver4BigramListPolicy newBigramPolicy(buffersToWrite->getMutableBigramDictContent(), - buffersToWrite->getTerminalPositionLookupTable(), headerPolicy); - Ver4ShortcutListPolicy newShortcutPolicy(buffersToWrite->getMutableShortcutDictContent(), - buffersToWrite->getTerminalPositionLookupTable()); - Ver4PatriciaTrieNodeWriter newPtNodeWriter(buffersToWrite->getWritableTrieBuffer(), - buffersToWrite, headerPolicy, &newPtNodeReader, &newPtNodeArrayreader, &newBigramPolicy, - &newShortcutPolicy); - // Re-assign terminal IDs for valid terminal PtNodes. - TerminalPositionLookupTable::TerminalIdMap terminalIdMap; - if(!buffersToWrite->getMutableTerminalPositionLookupTable()->runGCTerminalIds( - &terminalIdMap)) { - return false; - } - // Run GC for probability dict content. - if (!buffersToWrite->getMutableLanguageModelDictContent()->runGC(&terminalIdMap, - mBuffers->getLanguageModelDictContent(), nullptr /* outNgramCount */)) { - return false; - } - // Run GC for bigram dict content. 
- if(!buffersToWrite->getMutableBigramDictContent()->runGC(&terminalIdMap, - mBuffers->getBigramDictContent(), outBigramCount)) { - return false; - } - // Run GC for shortcut dict content. - if(!buffersToWrite->getMutableShortcutDictContent()->runGC(&terminalIdMap, - mBuffers->getShortcutDictContent())) { - return false; - } - DynamicPtReadingHelper newDictReadingHelper(&newPtNodeReader, &newPtNodeArrayreader); - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - DynamicPtGcEventListeners::TraversePolicyToUpdateAllPositionFields - traversePolicyToUpdateAllPositionFields(&newPtNodeWriter, &dictPositionRelocationMap); - if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( - &traversePolicyToUpdateAllPositionFields)) { - return false; - } - newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); - TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds(&newPtNodeWriter, &terminalIdMap); - if (!newDictReadingHelper.traverseAllPtNodesInPostorderDepthFirstManner( - &traversePolicyToUpdateAllPtNodeFlagsAndTerminalIds)) { - return false; - } - *outUnigramCount = traversePolicyToUpdateAllPositionFields.getUnigramCount(); - return true; -} - -bool Ver4PatriciaTrieWritingHelper::truncateUnigrams( - const Ver4PatriciaTrieNodeReader *const ptNodeReader, - Ver4PatriciaTrieNodeWriter *const ptNodeWriter, const int maxUnigramCount) { - const TerminalPositionLookupTable *const terminalPosLookupTable = - mBuffers->getTerminalPositionLookupTable(); - const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); - std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> - priorityQueue; - for (int i = 0; i < nextTerminalId; ++i) { - const int terminalPos = terminalPosLookupTable->getTerminalPtNodePosition(i); - if (terminalPos == NOT_A_DICT_POS) { - continue; - } - const ProbabilityEntry probabilityEntry = - 
mBuffers->getLanguageModelDictContent()->getProbabilityEntry(i); - const int probability = probabilityEntry.hasHistoricalInfo() ? - ForgettingCurveUtils::decodeProbability( - probabilityEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : - probabilityEntry.getProbability(); - priorityQueue.push(DictProbability(terminalPos, probability, - probabilityEntry.getHistoricalInfo()->getTimeStamp())); - } - - // Delete unigrams. - while (static_cast<int>(priorityQueue.size()) > maxUnigramCount) { - const int ptNodePos = priorityQueue.top().getDictPos(); - priorityQueue.pop(); - const PtNodeParams ptNodeParams = - ptNodeReader->fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos); - if (ptNodeParams.representsNonWordInfo()) { - continue; - } - if (!ptNodeWriter->markPtNodeAsWillBecomeNonTerminal(&ptNodeParams)) { - AKLOGE("Cannot mark PtNode as willBecomeNonterminal. PtNode pos: %d", ptNodePos); - return false; - } - } - return true; -} - -bool Ver4PatriciaTrieWritingHelper::truncateBigrams(const int maxBigramCount) { - const TerminalPositionLookupTable *const terminalPosLookupTable = - mBuffers->getTerminalPositionLookupTable(); - const int nextTerminalId = terminalPosLookupTable->getNextTerminalId(); - std::priority_queue<DictProbability, std::vector<DictProbability>, DictProbabilityComparator> - priorityQueue; - BigramDictContent *const bigramDictContent = mBuffers->getMutableBigramDictContent(); - for (int i = 0; i < nextTerminalId; ++i) { - const int bigramListPos = bigramDictContent->getBigramListHeadPos(i); - if (bigramListPos == NOT_A_DICT_POS) { - continue; - } - bool hasNext = true; - int readingPos = bigramListPos; - while (hasNext) { - const BigramEntry bigramEntry = - bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); - const int entryPos = readingPos - bigramDictContent->getBigramEntrySize(); - hasNext = bigramEntry.hasNext(); - if (!bigramEntry.isValid()) { - continue; - } - const int probability = bigramEntry.hasHistoricalInfo() ? 
- ForgettingCurveUtils::decodeProbability( - bigramEntry.getHistoricalInfo(), mBuffers->getHeaderPolicy()) : - bigramEntry.getProbability(); - priorityQueue.push(DictProbability(entryPos, probability, - bigramEntry.getHistoricalInfo()->getTimeStamp())); - } - } - - // Delete bigrams. - while (static_cast<int>(priorityQueue.size()) > maxBigramCount) { - const int entryPos = priorityQueue.top().getDictPos(); - const BigramEntry bigramEntry = bigramDictContent->getBigramEntry(entryPos); - const BigramEntry invalidatedBigramEntry = bigramEntry.getInvalidatedEntry(); - if (!bigramDictContent->writeBigramEntry(&invalidatedBigramEntry, entryPos)) { - AKLOGE("Cannot write bigram entry to remove. pos: %d", entryPos); - return false; - } - priorityQueue.pop(); - } - return true; -} - -bool Ver4PatriciaTrieWritingHelper::TraversePolicyToUpdateAllPtNodeFlagsAndTerminalIds - ::onVisitingPtNode(const PtNodeParams *const ptNodeParams) { - if (!ptNodeParams->isTerminal()) { - return true; - } - TerminalPositionLookupTable::TerminalIdMap::const_iterator it = - mTerminalIdMap->find(ptNodeParams->getTerminalId()); - if (it == mTerminalIdMap->end()) { - AKLOGE("terminal Id %d is not in the terminal position map. map size: %zd", - ptNodeParams->getTerminalId(), mTerminalIdMap->size()); - return false; - } - if (!mPtNodeWriter->updateTerminalId(ptNodeParams, it->second)) { - AKLOGE("Cannot update terminal id. 
%d -> %d", it->first, it->second); - return false; - } - return true; -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index 3fc566e7a..856808a74 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -24,6 +24,7 @@ const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; const float ScoringParams::EXACT_MATCH_PROMOTION = 1.1f; +const float ScoringParams::PERFECT_MATCH_PROMOTION = 1.1f; const float ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH = 0.01f; const float ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH = 0.02f; const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f; @@ -31,6 +32,7 @@ const float ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH = 0.03f; // TODO: Unlimit max cache dic node size const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE = 50; const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.1524f; @@ -47,18 +49,21 @@ const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.5508f; const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.674f; const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.639f; const float ScoringParams::TRANSPOSITION_COST = 0.5608f; -const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.334f; -const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.4576f; +const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.33f; +const float ScoringParams::SPACE_OMISSION_COST = 0.1f; +const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.37972f; const float ScoringParams::SUBSTITUTION_COST = 0.3806f; -const float 
ScoringParams::COST_NEW_WORD = 0.0314f; const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.3224f; const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.1214f; const float ScoringParams::COST_FIRST_COMPLETION = 0.4836f; const float ScoringParams::COST_COMPLETION = 0.00624f; const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.0683f; const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.0362f; -const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.4182f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.3482f; const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.095f; +const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION = 0.99f; +const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION = 0.99f; +const float ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE = 0.99f; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h index b12de6d87..6f327a370 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -30,9 +30,11 @@ class ScoringParams { static const float AUTOCORRECT_OUTPUT_THRESHOLD; static const int MAX_CACHE_DIC_NODE_SIZE; static const int MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; + static const int MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE; static const int THRESHOLD_SHORT_WORD_LENGTH; static const float EXACT_MATCH_PROMOTION; + static const float PERFECT_MATCH_PROMOTION; static const float CASE_ERROR_PENALTY_FOR_EXACT_MATCH; static const float ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; static const float DIGRAPH_PENALTY_FOR_EXACT_MATCH; @@ -55,9 +57,9 @@ class ScoringParams { static const float INSERTION_COST_FIRST_CHAR; 
static const float TRANSPOSITION_COST; static const float SPACE_SUBSTITUTION_COST; + static const float SPACE_OMISSION_COST; static const float ADDITIONAL_PROXIMITY_COST; static const float SUBSTITUTION_COST; - static const float COST_NEW_WORD; static const float COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE; static const float DISTANCE_WEIGHT_LANGUAGE; static const float COST_FIRST_COMPLETION; @@ -68,6 +70,9 @@ class ScoringParams { static const float TYPING_BASE_OUTPUT_SCORE; static const float TYPING_MAX_OUTPUT_SCORE_PER_INPUT; static const float NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT; + static const float LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION; + static const float LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION; + static const float LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE; private: DISALLOW_IMPLICIT_CONSTRUCTORS(ScoringParams); diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h index 04cb6603a..6acd767ea 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -33,32 +33,61 @@ class TypingScoring : public Scoring { static const TypingScoring *getInstance() { return &sInstance; } AK_FORCE_INLINE void getMostProbableString(const DicTraverseSession *const traverseSession, - const float languageWeight, SuggestionResults *const outSuggestionResults) const {} + const float weightOfLangModelVsSpatialModel, + SuggestionResults *const outSuggestionResults) const {} - AK_FORCE_INLINE float getAdjustedLanguageWeight(DicTraverseSession *const traverseSession, - DicNode *const terminals, const int size) const { + AK_FORCE_INLINE float getAdjustedWeightOfLangModelVsSpatialModel( + DicTraverseSession *const traverseSession, DicNode *const terminals, + const int size) const { return 1.0f; } AK_FORCE_INLINE int calculateFinalScore(const float compoundDistance, const int inputSize, const 
ErrorTypeUtils::ErrorType containedErrorTypes, const bool forceCommit, - const bool boostExactMatches) const { + const bool boostExactMatches, const bool hasProbabilityZero) const { const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE - compoundDistance / maxDistance; if (forceCommit) { score += ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD; } - if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { - score += ScoringParams::EXACT_MATCH_PROMOTION; - if ((ErrorTypeUtils::MATCH_WITH_CASE_ERROR & containedErrorTypes) != 0) { - score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + if (hasProbabilityZero) { + // Previously, when both legitimate 0-frequency words (such as distracters) and + // offensive words were encoded in the same way, distracters would never show up + // when the user blocked offensive words (the default setting, as well as the + // setting for regression tests). + // + // When b/11031090 was fixed and a separate encoding was used for offensive words, + // 0-frequency words would no longer be blocked when they were an "exact match" + // (where case mismatches and accent mismatches would be considered an "exact + // match"). The exact match boosting functionality meant that, for example, when + // the user typed "mt" they would be suggested the word "Mt", although they most + // probably meant to type "my". + // + // For this reason, we introduced this change, which does the following: + // * Defines the "perfect match" as a really exact match, with no room for case or + // accent mismatches + // * When the target word has probability zero (as "Mt" does, because it is a + // distracter), ONLY boost its score if it is a perfect match. + // + // By doing this, when the user types "mt", the word "Mt" will NOT be boosted, and + // they will get "my". 
However, if the user makes an explicit effort to type "Mt", + // we do boost the word "Mt" so that the user's input is not autocorrected to "My". + if (boostExactMatches && ErrorTypeUtils::isPerfectMatch(containedErrorTypes)) { + score += ScoringParams::PERFECT_MATCH_PROMOTION; } - if ((ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR & containedErrorTypes) != 0) { - score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; - } - if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { - score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } else { + if (boostExactMatches && ErrorTypeUtils::isExactMatch(containedErrorTypes)) { + score += ScoringParams::EXACT_MATCH_PROMOTION; + if ((ErrorTypeUtils::MATCH_WITH_WRONG_CASE & containedErrorTypes) != 0) { + score -= ScoringParams::CASE_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT & containedErrorTypes) != 0) { + score -= ScoringParams::ACCENT_ERROR_PENALTY_FOR_EXACT_MATCH; + } + if ((ErrorTypeUtils::MATCH_WITH_DIGRAPH & containedErrorTypes) != 0) { + score -= ScoringParams::DIGRAPH_PENALTY_FOR_EXACT_MATCH; + } } } return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h index cb3dfac70..b9b6314ae 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h @@ -26,6 +26,7 @@ #include "suggest/core/layout/proximity_info_utils.h" #include "suggest/core/policy/traversal.h" #include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/suggest_options.h" #include "suggest/policyimpl/typing/scoring_params.h" #include "utils/char_utils.h" @@ -77,6 +78,13 @@ class TypingTraversal : public Traversal { if (!CORRECT_NEW_WORD_SPACE_SUBSTITUTION) { return false; } + if (traverseSession->getSuggestOptions()->weightForLocale() + < 
ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_SUBSTITUTION) { + // Space substitution is heavy, so we skip doing it if the weight for this language + // is low because we anticipate the suggestions out of this dictionary are not for + // the language the user intends to type in. + return false; + } if (!canDoLookAheadCorrection(traverseSession, dicNode)) { return false; } @@ -91,6 +99,13 @@ class TypingTraversal : public Traversal { if (!CORRECT_NEW_WORD_SPACE_OMISSION) { return false; } + if (traverseSession->getSuggestOptions()->weightForLocale() + < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SPACE_OMISSION) { + // Space omission is heavy, so we skip doing it if the weight for this language + // is low because we anticipate the suggestions out of this dictionary are not for + // the language the user intends to type in. + return false; + } const int inputSize = traverseSession->getInputSize(); // TODO: Don't refer to isCompletion? if (dicNode->isCompletion(inputSize)) { @@ -141,9 +156,14 @@ class TypingTraversal : public Traversal { return DicNodeVector::DEFAULT_NODES_SIZE_FOR_OPTIMIZATION; } - AK_FORCE_INLINE int getMaxCacheSize(const int inputSize) const { - return (inputSize <= 1) ? 
ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT - : ScoringParams::MAX_CACHE_DIC_NODE_SIZE; + AK_FORCE_INLINE int getMaxCacheSize(const int inputSize, const float weightForLocale) const { + if (inputSize <= 1) { + return ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; + } + if (weightForLocale < ScoringParams::LOCALE_WEIGHT_THRESHOLD_FOR_SMALL_CACHE_SIZE) { + return ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_LOW_PROBABILITY_LOCALE; + } + return ScoringParams::MAX_CACHE_DIC_NODE_SIZE; } AK_FORCE_INLINE int getTerminalCacheSize() const { @@ -161,8 +181,8 @@ class TypingTraversal : public Traversal { return true; } - AK_FORCE_INLINE bool isGoodToTraverseNextWord(const DicNode *const dicNode) const { - const int probability = dicNode->getProbability(); + AK_FORCE_INLINE bool isGoodToTraverseNextWord(const DicNode *const dicNode, + const int probability) const { if (probability < ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY) { return false; } diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp index 54f65c786..a0e54115d 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp @@ -17,6 +17,7 @@ #include "suggest/policyimpl/typing/typing_weighting.h" #include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/layout/proximity_info.h" #include "suggest/policyimpl/typing/scoring_params.h" namespace latinime { @@ -36,30 +37,49 @@ ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType cor // Compare the node code point with original primary code point on the keyboard. 
const ProximityInfoState *const pInfoState = traverseSession->getProximityInfoState(0); - const int primaryOriginalCodePoint = pInfoState->getPrimaryOriginalCodePointAt( + const int primaryCodePoint = pInfoState->getPrimaryCodePointAt( dicNode->getInputIndex(0)); const int nodeCodePoint = dicNode->getNodeCodePoint(); - if (primaryOriginalCodePoint == nodeCodePoint) { + const int keyIndex = traverseSession->getProximityInfo()->getKeyIndexOf( + primaryCodePoint); + // TODO: Check whether the input code point is on the keyboard. + if (primaryCodePoint == nodeCodePoint) { // Node code point is same as original code point on the keyboard. return ErrorTypeUtils::NOT_AN_ERROR; - } else if (CharUtils::toLowerCase(primaryOriginalCodePoint) == + } else if (CharUtils::toLowerCase(primaryCodePoint) == CharUtils::toLowerCase(nodeCodePoint)) { // Only cases of the code points are different. - return ErrorTypeUtils::MATCH_WITH_CASE_ERROR; - } else if (CharUtils::toBaseCodePoint(primaryOriginalCodePoint) == - CharUtils::toBaseCodePoint(nodeCodePoint)) { + return ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } else if (primaryCodePoint == CharUtils::toBaseCodePoint(nodeCodePoint)) { // Node code point is a variant of original code point. - return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR; - } else { + return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT; + } else if (CharUtils::toBaseCodePoint(primaryCodePoint) + == CharUtils::toBaseCodePoint(nodeCodePoint)) { + // Base code points are the same but the code point is intentionally input. + if (keyIndex == NOT_AN_INDEX) { + return ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT; + } + return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT; + } else if (CharUtils::toLowerCase(primaryCodePoint) + == CharUtils::toBaseLowerCase(nodeCodePoint)) { // Node code point is a variant of original code point and the cases are also // different. 
- return ErrorTypeUtils::MATCH_WITH_ACCENT_ERROR - | ErrorTypeUtils::MATCH_WITH_CASE_ERROR; + return ErrorTypeUtils::MATCH_WITH_MISSING_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } else { + if (keyIndex == NOT_AN_INDEX) { + return ErrorTypeUtils::MATCH_WITH_MISSING_EXPLICIT_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; + } + // Base code points are the same and the cases are different. + return ErrorTypeUtils::MATCH_WITH_WRONG_ACCENT + | ErrorTypeUtils::MATCH_WITH_WRONG_CASE; } } break; case CT_ADDITIONAL_PROXIMITY: - return ErrorTypeUtils::PROXIMITY_CORRECTION; + // TODO: Change to EDIT_CORRECTION. + return ErrorTypeUtils::PROXIMITY_CORRECTION; case CT_OMISSION: if (parentDicNode->canBeIntentionalOmission()) { return ErrorTypeUtils::INTENTIONAL_OMISSION; @@ -68,6 +88,8 @@ ErrorTypeUtils::ErrorType TypingWeighting::getErrorType(const CorrectionType cor } break; case CT_SUBSTITUTION: + // TODO: Quit setting PROXIMITY_CORRECTION. + return ErrorTypeUtils::EDIT_CORRECTION | ErrorTypeUtils::PROXIMITY_CORRECTION; case CT_INSERTION: case CT_TERMINAL_INSERTION: case CT_TRANSPOSITION: diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index 84077174d..1338ac81a 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -150,9 +150,10 @@ class TypingWeighting : public Weighting { return cost + weightedDistance; } - float getNewWordSpatialCost(const DicTraverseSession *const traverseSession, + float getSpaceOmissionCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const { - return ScoringParams::COST_NEW_WORD * traverseSession->getMultiWordCostMultiplier(); + const float cost = ScoringParams::SPACE_OMISSION_COST; + return cost * traverseSession->getMultiWordCostMultiplier(); } float getNewWordBigramLanguageCost(const 
DicTraverseSession *const traverseSession, @@ -202,7 +203,10 @@ class TypingWeighting : public Weighting { AK_FORCE_INLINE float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { - const float cost = ScoringParams::SPACE_SUBSTITUTION_COST + ScoringParams::COST_NEW_WORD; + const int inputIndex = dicNode->getInputIndex(0); + const float distanceToSpaceKey = traverseSession->getProximityInfoState(0) + ->getPointToKeyLength(inputIndex, KEYCODE_SPACE); + const float cost = ScoringParams::SPACE_SUBSTITUTION_COST * distanceToSpaceKey; return cost * traverseSession->getMultiWordCostMultiplier(); } diff --git a/native/jni/src/utils/byte_array_view.h b/native/jni/src/utils/byte_array_view.h index 2c97c6d58..2b778af6f 100644 --- a/native/jni/src/utils/byte_array_view.h +++ b/native/jni/src/utils/byte_array_view.h @@ -42,6 +42,13 @@ class ReadOnlyByteArrayView { return mPtr; } + AK_FORCE_INLINE const ReadOnlyByteArrayView skip(const size_t n) const { + if (mSize <= n) { + return ReadOnlyByteArrayView(); + } + return ReadOnlyByteArrayView(mPtr + n, mSize - n); + } + private: DISALLOW_ASSIGNMENT_OPERATOR(ReadOnlyByteArrayView); @@ -77,10 +84,12 @@ class ReadWriteByteArrayView { } private: - DISALLOW_ASSIGNMENT_OPERATOR(ReadWriteByteArrayView); + // Default copy constructor and assignment operator are used for using this class with STL + // containers. - uint8_t *const mPtr; - const size_t mSize; + // These members cannot be const to have the assignment operator. 
+ uint8_t *mPtr; + size_t mSize; }; } // namespace latinime diff --git a/native/jni/src/utils/char_utils.cpp b/native/jni/src/utils/char_utils.cpp index b17e0847d..a43e6dd62 100644 --- a/native/jni/src/utils/char_utils.cpp +++ b/native/jni/src/utils/char_utils.cpp @@ -1057,11 +1057,11 @@ static int compare_pair_capital(const void *a, const void *b) { - static_cast<int>((static_cast<const struct LatinCapitalSmallPair *>(b))->capital); } -/* static */ unsigned short CharUtils::latin_tolower(const unsigned short c) { +/* static */ int CharUtils::latin_tolower(const int c) { struct LatinCapitalSmallPair *p = static_cast<struct LatinCapitalSmallPair *>(bsearch(&c, SORTED_CHAR_MAP, NELEMS(SORTED_CHAR_MAP), sizeof(SORTED_CHAR_MAP[0]), compare_pair_capital)); - return p ? p->small : c; + return p ? static_cast<int>(p->small) : c; } /* @@ -1117,7 +1117,9 @@ static int compare_pair_capital(const void *a, const void *b) { // TODO: Check if it's really acceptable to consider ø a diacritical variant of o /* U+0100 */ 0x0041, 0x0061, 0x0041, 0x0061, 0x0041, 0x0061, 0x0043, 0x0063, /* U+0108 */ 0x0043, 0x0063, 0x0043, 0x0063, 0x0043, 0x0063, 0x0044, 0x0064, - /* U+0110 */ 0x0110, 0x0111, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, + /* U+0110 */ 0x0046, 0x0064, 0x0045, 0x0065, 0x0045, 0x0065, 0x0045, 0x0065, + // U+0110: Manually changed from 0110 to 0046 + // U+0111: Manually changed from 0111 to 0064 /* U+0118 */ 0x0045, 0x0065, 0x0045, 0x0065, 0x0047, 0x0067, 0x0047, 0x0067, /* U+0120 */ 0x0047, 0x0067, 0x0047, 0x0067, 0x0048, 0x0068, 0x0126, 0x0127, /* U+0128 */ 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, 0x0049, 0x0069, @@ -1135,6 +1137,9 @@ static int compare_pair_capital(const void *a, const void *b) { /* U+0170 */ 0x0055, 0x0075, 0x0055, 0x0075, 0x0057, 0x0077, 0x0059, 0x0079, /* U+0178 */ 0x0059, 0x005A, 0x007A, 0x005A, 0x007A, 0x005A, 0x007A, 0x0073, /* U+0180 */ 0x0180, 0x0181, 0x0182, 0x0183, 0x0184, 0x0185, 0x0186, 0x0187, + // TODO: A lot of letters are 
their own base code points, but for + // some (e.g. U+0180) it doesn't seem right. Ideally each code point should + // be checked individually with all languages it's used in. /* U+0188 */ 0x0188, 0x0189, 0x018A, 0x018B, 0x018C, 0x018D, 0x018E, 0x018F, /* U+0190 */ 0x0190, 0x0191, 0x0192, 0x0193, 0x0194, 0x0195, 0x0196, 0x0197, /* U+0198 */ 0x0198, 0x0199, 0x019A, 0x019B, 0x019C, 0x019D, 0x019E, 0x019F, diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h index 63786502b..7871c26ef 100644 --- a/native/jni/src/utils/char_utils.h +++ b/native/jni/src/utils/char_utils.h @@ -27,20 +27,14 @@ namespace latinime { class CharUtils { public: + static const std::vector<int> EMPTY_STRING; + static AK_FORCE_INLINE bool isAsciiUpper(int c) { // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...). return (c >= 'A' && c <= 'Z'); } - static AK_FORCE_INLINE int toAsciiLower(int c) { - return c - 'A' + 'a'; - } - - static AK_FORCE_INLINE bool isAscii(int c) { - return isascii(c) != 0; - } - static AK_FORCE_INLINE int toLowerCase(const int c) { if (isAsciiUpper(c)) { return toAsciiLower(c); @@ -48,7 +42,7 @@ class CharUtils { if (isAscii(c)) { return c; } - return static_cast<int>(latin_tolower(static_cast<unsigned short>(c))); + return latin_tolower(c); } static AK_FORCE_INLINE int toBaseLowerCase(const int c) { @@ -59,7 +53,6 @@ class CharUtils { // TODO: Do not hardcode here return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS; } - static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) { int size = 0; for (; size < arraySize; ++size) { @@ -91,9 +84,6 @@ class CharUtils { return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT; } - static unsigned short latin_tolower(const unsigned short c); - static const std::vector<int> EMPTY_STRING; - // 
Returns updated code point count. Returns 0 when the code points cannot be marked as a // Beginning-of-Sentence. static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints, @@ -111,6 +101,17 @@ class CharUtils { return codePointCount + 1; } + // Returns updated code point count. + static AK_FORCE_INLINE int removeBeginningOfSentenceMarker(int *const codePoints, + const int codePointCount) { + if (codePointCount <= 0 || codePoints[0] != CODE_POINT_BEGINNING_OF_SENTENCE) { + return codePointCount; + } + const int newCodePointCount = codePointCount - 1; + memmove(codePoints, codePoints + 1, sizeof(int) * newCodePointCount); + return newCodePointCount; + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils); @@ -125,6 +126,16 @@ class CharUtils { */ static const int BASE_CHARS_SIZE = 0x0500; static const unsigned short BASE_CHARS[BASE_CHARS_SIZE]; + + static AK_FORCE_INLINE bool isAscii(int c) { + return isascii(c) != 0; + } + + static AK_FORCE_INLINE int toAsciiLower(int c) { + return c - 'A' + 'a'; + } + + static int latin_tolower(const int c); }; } // namespace latinime #endif // LATINIME_CHAR_UTILS_H diff --git a/native/jni/src/utils/int_array_view.h b/native/jni/src/utils/int_array_view.h index c1ddc9812..e0f671056 100644 --- a/native/jni/src/utils/int_array_view.h +++ b/native/jni/src/utils/int_array_view.h @@ -17,8 +17,10 @@ #ifndef LATINIME_INT_ARRAY_VIEW_H #define LATINIME_INT_ARRAY_VIEW_H +#include <algorithm> +#include <array> #include <cstdint> -#include <cstdlib> +#include <cstring> #include <vector> #include "defines.h" @@ -56,14 +58,14 @@ class IntArrayView { explicit IntArrayView(const std::vector<int> &vector) : mPtr(vector.data()), mSize(vector.size()) {} - template <int N> - AK_FORCE_INLINE static IntArrayView fromFixedSizeArray(const int (&array)[N]) { - return IntArrayView(array, N); + template <size_t N> + AK_FORCE_INLINE static IntArrayView fromArray(const std::array<int, N> &array) { + return IntArrayView(array.data(), 
array.size()); } - // Returns a view that points one int object. Does not take ownership of the given object. - AK_FORCE_INLINE static IntArrayView fromObject(const int *const object) { - return IntArrayView(object, 1); + // Returns a view that points one int object. + AK_FORCE_INLINE static IntArrayView singleElementView(const int *const ptr) { + return IntArrayView(ptr, 1); } AK_FORCE_INLINE int operator[](const size_t index) const { @@ -91,6 +93,69 @@ class IntArrayView { return mPtr + mSize; } + AK_FORCE_INLINE bool contains(const int value) const { + return std::find(begin(), end(), value) != end(); + } + + // Returns the view whose size is smaller than or equal to the given count. + AK_FORCE_INLINE const IntArrayView limit(const size_t maxSize) const { + return IntArrayView(mPtr, std::min(maxSize, mSize)); + } + + AK_FORCE_INLINE const IntArrayView skip(const size_t n) const { + if (mSize <= n) { + return IntArrayView(); + } + return IntArrayView(mPtr + n, mSize - n); + } + + template <size_t N> + void copyToArray(std::array<int, N> *const buffer, const size_t offset) const { + ASSERT(mSize + offset <= N); + memmove(buffer->data() + offset, mPtr, sizeof(int) * mSize); + } + + AK_FORCE_INLINE int firstOrDefault(const int defaultValue) const { + if (empty()) { + return defaultValue; + } + return mPtr[0]; + } + + AK_FORCE_INLINE int lastOrDefault(const int defaultValue) const { + if (empty()) { + return defaultValue; + } + return mPtr[mSize - 1]; + } + + AK_FORCE_INLINE std::vector<int> toVector() const { + return std::vector<int>(begin(), end()); + } + + std::vector<IntArrayView> split(const int separator, const int limit = S_INT_MAX) const { + if (limit <= 0) { + return std::vector<IntArrayView>(); + } + std::vector<IntArrayView> result; + if (limit == 1) { + result.emplace_back(mPtr, mSize); + return result; + } + size_t startIndex = 0; + for (size_t i = 0; i < mSize; ++i) { + if (mPtr[i] == separator) { + result.emplace_back(mPtr + startIndex, i - 
startIndex); + startIndex = i + 1; + if (result.size() >= static_cast<size_t>(limit - 1)) { + break; + } + } + } + result.emplace_back(mPtr + startIndex, mSize - startIndex); + return result; + } + private: DISALLOW_ASSIGNMENT_OPERATOR(IntArrayView); @@ -100,6 +165,9 @@ class IntArrayView { using WordIdArrayView = IntArrayView; using PtNodePosArrayView = IntArrayView; +using CodePointArrayView = IntArrayView; +template <size_t size> +using WordIdArray = std::array<int, size>; } // namespace latinime #endif // LATINIME_MEMORY_VIEW_H diff --git a/native/jni/src/utils/jni_data_utils.cpp b/native/jni/src/utils/jni_data_utils.cpp index 5555293d5..41f0623d8 100644 --- a/native/jni/src/utils/jni_data_utils.cpp +++ b/native/jni/src/utils/jni_data_utils.cpp @@ -16,9 +16,100 @@ #include "utils/jni_data_utils.h" +#include "utils/int_array_view.h" + namespace latinime { const int JniDataUtils::CODE_POINT_REPLACEMENT_CHARACTER = 0xFFFD; const int JniDataUtils::CODE_POINT_NULL = 0; +/* static */ void JniDataUtils::outputWordProperty(JNIEnv *const env, + const WordProperty &wordProperty, jintArray outCodePoints, jbooleanArray outFlags, + jintArray outProbabilityInfo, jobject outNgramPrevWordsArray, + jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets, + jobject outNgramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) { + const CodePointArrayView codePoints = wordProperty.getCodePoints(); + JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */, + MAX_WORD_LENGTH /* maxLength */, codePoints.data(), codePoints.size(), + false /* needsNullTermination */); + const UnigramProperty &unigramProperty = wordProperty.getUnigramProperty(); + const std::vector<NgramProperty> &ngrams = wordProperty.getNgramProperties(); + jboolean flags[] = {unigramProperty.isNotAWord(), unigramProperty.isPossiblyOffensive(), + !ngrams.empty(), unigramProperty.hasShortcuts(), + unigramProperty.representsBeginningOfSentence()}; + 
env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); + const HistoricalInfo &historicalInfo = unigramProperty.getHistoricalInfo(); + int probabilityInfo[] = {unigramProperty.getProbability(), historicalInfo.getTimestamp(), + historicalInfo.getLevel(), historicalInfo.getCount()}; + env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo), + probabilityInfo); + + jclass integerClass = env->FindClass("java/lang/Integer"); + jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); + jclass arrayListClass = env->FindClass("java/util/ArrayList"); + jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); + + // Output ngrams. + jclass intArrayClass = env->FindClass("[I"); + for (const auto &ngramProperty : ngrams) { + const NgramContext *const ngramContext = ngramProperty.getNgramContext(); + jobjectArray prevWordWordCodePointsArray = env->NewObjectArray( + ngramContext->getPrevWordCount(), intArrayClass, nullptr); + jbooleanArray prevWordIsBeginningOfSentenceArray = + env->NewBooleanArray(ngramContext->getPrevWordCount()); + for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) { + const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1); + jintArray prevWordCodePoints = env->NewIntArray(codePoints.size()); + JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */, + codePoints.size(), codePoints.data(), codePoints.size(), + false /* needsNullTermination */); + env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints); + env->DeleteLocalRef(prevWordCodePoints); + JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i, + ngramContext->isNthPrevWordBeginningOfSentence(i + 1)); + } + env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray); + env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId, + 
prevWordIsBeginningOfSentenceArray); + env->DeleteLocalRef(prevWordWordCodePointsArray); + env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray); + + const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints(); + jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size()); + JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */, + targetWordCodePoints->size(), targetWordCodePoints->data(), + targetWordCodePoints->size(), false /* needsNullTermination */); + env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray); + env->DeleteLocalRef(targetWordCodePointArray); + + const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo(); + int bigramProbabilityInfo[] = {ngramProperty.getProbability(), + ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(), + ngramHistoricalInfo.getCount()}; + jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); + env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, + NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); + env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray); + env->DeleteLocalRef(bigramProbabilityInfoArray); + } + + // Output shortcuts. 
+ for (const auto &shortcut : unigramProperty.getShortcuts()) { + const std::vector<int> *const targetCodePoints = shortcut.getTargetCodePoints(); + jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size()); + JniDataUtils::outputCodePoints(env, shortcutTargetCodePointArray, 0 /* start */, + targetCodePoints->size(), targetCodePoints->data(), targetCodePoints->size(), + false /* needsNullTermination */); + env->CallBooleanMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); + env->DeleteLocalRef(shortcutTargetCodePointArray); + jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, + shortcut.getProbability()); + env->CallBooleanMethod(outShortcutProbabilities, addMethodId, integerProbability); + env->DeleteLocalRef(integerProbability); + } + env->DeleteLocalRef(integerClass); + env->DeleteLocalRef(arrayListClass); +} + } // namespace latinime diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h index cb82d3c3b..8024e34c4 100644 --- a/native/jni/src/utils/jni_data_utils.h +++ b/native/jni/src/utils/jni_data_utils.h @@ -20,10 +20,11 @@ #include <vector> #include "defines.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/interface/dictionary_header_structure_policy.h" +#include "dictionary/property/ngram_context.h" +#include "dictionary/property/word_property.h" #include "jni.h" -#include "suggest/core/session/prev_words_info.h" -#include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" #include "utils/char_utils.h" namespace latinime { @@ -50,6 +51,7 @@ class JniDataUtils { const jsize keyUtf8Length = env->GetStringUTFLength(keyString); char keyChars[keyUtf8Length + 1]; env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars); + env->DeleteLocalRef(keyString); keyChars[keyUtf8Length] = '\0'; 
DictionaryHeaderStructurePolicy::AttributeMap::key_type key; HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key); @@ -59,6 +61,7 @@ class JniDataUtils { const jsize valueUtf8Length = env->GetStringUTFLength(valueString); char valueChars[valueUtf8Length + 1]; env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars); + env->DeleteLocalRef(valueString); valueChars[valueUtf8Length] = '\0'; DictionaryHeaderStructurePolicy::AttributeMap::mapped_type value; HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value); @@ -96,18 +99,14 @@ class JniDataUtils { } } - static PrevWordsInfo constructPrevWordsInfo(JNIEnv *env, jobjectArray prevWordCodePointArrays, - jbooleanArray isBeginningOfSentenceArray) { + static NgramContext constructNgramContext(JNIEnv *env, jobjectArray prevWordCodePointArrays, + jbooleanArray isBeginningOfSentenceArray, const size_t prevWordCount) { int prevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; int prevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; bool isBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; - jsize prevWordsCount = env->GetArrayLength(prevWordCodePointArrays); - for (size_t i = 0; i < NELEMS(prevWordCodePoints); ++i) { + for (size_t i = 0; i < prevWordCount; ++i) { prevWordCodePointCount[i] = 0; isBeginningOfSentence[i] = false; - if (prevWordsCount <= static_cast<int>(i)) { - continue; - } jintArray prevWord = (jintArray)env->GetObjectArrayElement(prevWordCodePointArrays, i); if (!prevWord) { continue; @@ -117,14 +116,15 @@ class JniDataUtils { continue; } env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]); + env->DeleteLocalRef(prevWord); prevWordCodePointCount[i] = prevWordLength; jboolean isBeginningOfSentenceBoolean = JNI_FALSE; env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */, &isBeginningOfSentenceBoolean); isBeginningOfSentence[i] = isBeginningOfSentenceBoolean == JNI_TRUE; } - return 
PrevWordsInfo(prevWordCodePoints, prevWordCodePointCount, isBeginningOfSentence, - MAX_PREV_WORD_COUNT_FOR_N_GRAM); + return NgramContext(prevWordCodePoints, prevWordCodePointCount, isBeginningOfSentence, + prevWordCount); } static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index, @@ -141,6 +141,12 @@ class JniDataUtils { env->SetFloatArrayRegion(array, index, 1 /* len */, &value); } + static void outputWordProperty(JNIEnv *const env, const WordProperty &wordProperty, + jintArray outCodePoints, jbooleanArray outFlags, jintArray outProbabilityInfo, + jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray, + jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(JniDataUtils); diff --git a/native/jni/src/utils/ngram_utils.h b/native/jni/src/utils/ngram_utils.h new file mode 100644 index 000000000..fa85ba35f --- /dev/null +++ b/native/jni/src/utils/ngram_utils.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_NGRAM_UTILS_H +#define LATINIME_NGRAM_UTILS_H + +#include "defines.h" + +namespace latinime { + +enum class NgramType : int { + Unigram = 0, + Bigram = 1, + Trigram = 2, + Quadgram = 3, + NotANgramType = -1, +}; + +namespace AllNgramTypes { +// Use anonymous namespace to avoid ODR (One Definition Rule) violation. +namespace { + +const NgramType ASCENDING[] = { + NgramType::Unigram, NgramType::Bigram, NgramType::Trigram +}; + +const NgramType DESCENDING[] = { + NgramType::Trigram, NgramType::Bigram, NgramType::Unigram +}; + +} // namespace +} // namespace AllNgramTypes + +class NgramUtils final { + public: + static AK_FORCE_INLINE NgramType getNgramTypeFromWordCount(const int wordCount) { + // Max supported ngram is (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram. + if (wordCount <= 0 || wordCount > MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1) { + return NgramType::NotANgramType; + } + // Convert word count to 0-origin enum value. + return static_cast<NgramType>(wordCount - 1); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(NgramUtils); + +}; +} +#endif /* LATINIME_NGRAM_UTILS_H */ diff --git a/native/jni/src/utils/profiler.h b/native/jni/src/utils/profiler.h new file mode 100644 index 000000000..5f107fed3 --- /dev/null +++ b/native/jni/src/utils/profiler.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LATINIME_PROFILER_H +#define LATINIME_PROFILER_H + +#ifdef FLAG_DO_PROFILE + +#include "defines.h" + +#include <ctime> +#include <unordered_map> + +namespace latinime { + +class Profiler final { + public: + Profiler(const clockid_t clockId) + : mClockId(clockId), mStartTime(getTimeInMicroSec()), mStartTimes(), mTimes(), + mCounters() {} + + ~Profiler() { + const float totalTime = + static_cast<float>(getTimeInMicroSec() - mStartTime) / 1000.f; + AKLOGI("Total time is %6.3f ms.", totalTime); + for (const auto &time : mTimes) { + AKLOGI("(%d): Used %4.2f%%, %8.4f ms. Called %d times.", time.first, + time.second / totalTime * 100.0f, time.second, mCounters[time.first]); + } + } + + void startTimer(const int id) { + mStartTimes[id] = getTimeInMicroSec(); + } + + void endTimer(const int id) { + mTimes[id] += static_cast<float>(getTimeInMicroSec() - mStartTimes[id]) / 1000.0f; + mCounters[id]++; + } + + operator bool() const { return false; } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Profiler); + + const clockid_t mClockId; + int64_t mStartTime; + std::unordered_map<int, int64_t> mStartTimes; + std::unordered_map<int, float> mTimes; + std::unordered_map<int, int> mCounters; + + int64_t getTimeInMicroSec() { + timespec time; + clock_gettime(mClockId, &time); + return static_cast<int64_t>(time.tv_sec) * 1000000 + + static_cast<int64_t>(time.tv_nsec) / 1000; + } +}; +} // namespace latinime + +#define PROF_INIT Profiler __LATINIME__PROFILER__(CLOCK_THREAD_CPUTIME_ID) +#define PROF_TIMER_START(timer_id) __LATINIME__PROFILER__.startTimer(timer_id) +#define PROF_TIMER_END(timer_id) __LATINIME__PROFILER__.endTimer(timer_id) + +#else // FLAG_DO_PROFILE + +#define PROF_INIT +#define PROF_TIMER_START(timer_id) +#define PROF_TIMER_END(timer_id) + +#endif // FLAG_DO_PROFILE + +#endif /* LATINIME_PROFILER_H */ |