diff options
Diffstat (limited to 'native')
77 files changed, 3093 insertions, 1935 deletions
diff --git a/native/jni/Android.mk b/native/jni/Android.mk index e14cf5a71..0e36f3df9 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -43,6 +43,7 @@ LATIN_IME_JNI_SRC_FILES := \ com_android_inputmethod_keyboard_ProximityInfo.cpp \ com_android_inputmethod_latin_BinaryDictionary.cpp \ com_android_inputmethod_latin_DicTraverseSession.cpp \ + com_android_inputmethod_latin_makedict_Ver3DictDecoder.cpp \ jni_common.cpp LATIN_IME_CORE_SRC_FILES := \ @@ -53,12 +54,7 @@ LATIN_IME_CORE_SRC_FILES := \ dic_nodes_cache.cpp) \ $(addprefix suggest/core/dictionary/, \ bigram_dictionary.cpp \ - binary_dictionary_format_utils.cpp \ - binary_dictionary_header.cpp \ - binary_dictionary_header_reading_utils.cpp \ - binary_dictionary_terminal_attributes_reading_utils.cpp \ bloom_filter.cpp \ - byte_array_utils.cpp \ dictionary.cpp \ digraph_utils.cpp \ multi_bigram_map.cpp) \ @@ -71,11 +67,23 @@ LATIN_IME_CORE_SRC_FILES := \ suggest/core/policy/weighting.cpp \ suggest/core/session/dic_traverse_session.cpp \ $(addprefix suggest/policyimpl/dictionary/, \ + bigram/bigram_list_read_write_utils.cpp \ + bigram/dynamic_bigram_list_policy.cpp \ + header/header_policy.cpp \ + header/header_reading_utils.cpp \ + shortcut/shortcut_list_reading_utils.cpp \ + dictionary_structure_with_buffer_policy_factory.cpp \ dynamic_patricia_trie_node_reader.cpp \ dynamic_patricia_trie_policy.cpp \ + dynamic_patricia_trie_reading_helper.cpp \ dynamic_patricia_trie_reading_utils.cpp \ + dynamic_patricia_trie_writing_helper.cpp \ patricia_trie_policy.cpp \ patricia_trie_reading_utils.cpp) \ + $(addprefix suggest/policyimpl/dictionary/utils/, \ + buffer_with_extendable_buffer.cpp \ + byte_array_utils.cpp \ + format_utils.cpp) \ suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ $(addprefix suggest/policyimpl/typing/, \ scoring_params.cpp \ diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 8b46c2644..86c2394d1 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -18,37 +18,20 @@ #include "com_android_inputmethod_latin_BinaryDictionary.h" -#include <cerrno> #include <cstring> // for memset() -#include <fcntl.h> -#include <sys/mman.h> -#include <unistd.h> #include "defines.h" #include "jni.h" #include "jni_common.h" -#include "suggest/core/dictionary/binary_dictionary_format_utils.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/suggest_options.h" +#include "suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" #include "utils/autocorrection_threshold_utils.h" namespace latinime { class ProximityInfo; -// Helper method -static void releaseDictBuf(const void *dictBuf, const size_t length, const int fd) { - int ret = munmap(const_cast<void *>(dictBuf), length); - if (ret != 0) { - AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); - } - ret = close(fd); - if (ret != 0) { - AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); - } -} - static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring sourceDir, jlong dictOffset, jlong dictSize, jboolean isUpdatable) { PROF_OPEN; @@ -61,41 +44,16 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s char sourceDirChars[sourceDirUtf8Length + 1]; env->GetStringUTFRegion(sourceDir, 0, env->GetStringLength(sourceDir), sourceDirChars); sourceDirChars[sourceDirUtf8Length] = '\0'; - int fd = 0; - void *dictBuf = 0; - int offset = 0; - const bool updatableMmap = (isUpdatable == JNI_TRUE); - const int openMode = updatableMmap ? O_RDWR : O_RDONLY; - fd = open(sourceDirChars, openMode); - if (fd < 0) { - AKLOGE("DICT: Can't open sourceDir. sourceDirChars=%s errno=%d", sourceDirChars, errno); - return 0; - } - int pagesize = getpagesize(); - offset = static_cast<int>(dictOffset) % pagesize; - int adjDictOffset = static_cast<int>(dictOffset) - offset; - int adjDictSize = static_cast<int>(dictSize) + offset; - const int protMode = updatableMmap ? PROT_READ | PROT_WRITE : PROT_READ; - dictBuf = mmap(0, adjDictSize, protMode, MAP_PRIVATE, fd, adjDictOffset); - if (dictBuf == MAP_FAILED) { - AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPolicy = + DictionaryStructureWithBufferPolicyFactory::newDictionaryStructureWithBufferPolicy( + sourceDirChars, static_cast<int>(sourceDirUtf8Length), + static_cast<int>(dictOffset), static_cast<int>(dictSize), + isUpdatable == JNI_TRUE); + if (!dictionaryStructureWithBufferPolicy) { return 0; } - dictBuf = static_cast<char *>(dictBuf) + offset; - if (!dictBuf) { - AKLOGE("DICT: dictBuf is null"); - return 0; - } - Dictionary *dictionary = 0; - if (BinaryDictionaryFormatUtils::UNKNOWN_VERSION - == BinaryDictionaryFormatUtils::detectFormatVersion(static_cast<uint8_t *>(dictBuf), - static_cast<int>(dictSize))) { - AKLOGE("DICT: dictionary format is unknown, bad magic number"); - releaseDictBuf(static_cast<const char *>(dictBuf) - offset, adjDictSize, fd); - } else { - dictionary = new Dictionary(env, dictBuf, static_cast<int>(dictSize), fd, offset, - updatableMmap); - } + + Dictionary *const dictionary = new Dictionary(env, dictionaryStructureWithBufferPolicy); PROF_END(66); PROF_CLOSE; return reinterpret_cast<jlong>(dictionary); @@ -104,13 +62,6 @@ static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring s static void latinime_BinaryDictionary_close(JNIEnv *env, jclass clazz, jlong dict) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; - const BinaryDictionaryInfo *const binaryDictionaryInfo = dictionary->getBinaryDictionaryInfo(); - const int dictBufOffset = binaryDictionaryInfo->getDictBufOffset(); - const void *dictBuf = binaryDictionaryInfo->getDictBuf(); - if (!dictBuf) return; - releaseDictBuf(static_cast<const char *>(dictBuf) - dictBufOffset, - binaryDictionaryInfo->getDictSize() + dictBufOffset, - binaryDictionaryInfo->getMmapFd()); delete dictionary; } diff --git a/native/jni/com_android_inputmethod_latin_makedict_Ver3DictDecoder.cpp b/native/jni/com_android_inputmethod_latin_makedict_Ver3DictDecoder.cpp new file mode 100644 index 000000000..15088b65a --- /dev/null +++ b/native/jni/com_android_inputmethod_latin_makedict_Ver3DictDecoder.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "LatinIME: jni: Ver3DictDecoder" + +#include "com_android_inputmethod_latin_makedict_Ver3DictDecoder.h" + +#include "defines.h" +#include "jni.h" +#include "jni_common.h" + +namespace latinime { +static int latinime_Ver3DictDecoder_doNothing(JNIEnv *env, jclass clazz) { + // This is a phony method for test - it does nothing. It just returns some value + // unlikely to be in memory by chance for testing purposes. + // TODO: remove this method. + return 2097; +} + +static const JNINativeMethod sMethods[] = { + { + // TODO: remove this entry when we have one useful method in here + const_cast<char *>("doNothing"), + const_cast<char *>("()I"), + reinterpret_cast<void *>(latinime_Ver3DictDecoder_doNothing) + }, +}; + +int register_Ver3DictDecoder(JNIEnv *env) { + const char *const kClassPathName = + "com/android/inputmethod/latin/makedict/Ver3DictDecoder"; + return registerNativeMethods(env, kClassPathName, sMethods, NELEMS(sMethods)); +} +} // namespace latinime diff --git a/native/jni/com_android_inputmethod_latin_makedict_Ver3DictDecoder.h b/native/jni/com_android_inputmethod_latin_makedict_Ver3DictDecoder.h new file mode 100644 index 000000000..07e80f1d8 --- /dev/null +++ b/native/jni/com_android_inputmethod_latin_makedict_Ver3DictDecoder.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _COM_ANDROID_INPUTMETHOD_LATIN_MAKEDICT_VER3DICTDECODER_H +#define _COM_ANDROID_INPUTMETHOD_LATIN_MAKEDICT_VER3DICTDECODER_H + +#include "jni.h" + +namespace latinime { +int register_Ver3DictDecoder(JNIEnv *env); +} // namespace latinime +#endif // _COM_ANDROID_INPUTMETHOD_LATIN_MAKEDICT_VER3DICTDECODER_H diff --git a/native/jni/jni_common.cpp b/native/jni/jni_common.cpp index f2867d7c3..3a8f4362d 100644 --- a/native/jni/jni_common.cpp +++ b/native/jni/jni_common.cpp @@ -18,9 +18,12 @@ #include "jni_common.h" +#ifndef HOST_TOOL #include "com_android_inputmethod_keyboard_ProximityInfo.h" #include "com_android_inputmethod_latin_BinaryDictionary.h" #include "com_android_inputmethod_latin_DicTraverseSession.h" +#endif +#include "com_android_inputmethod_latin_makedict_Ver3DictDecoder.h" #include "defines.h" /* @@ -38,6 +41,7 @@ jint JNI_OnLoad(JavaVM *vm, void *reserved) { AKLOGE("ERROR: JNIEnv is invalid"); return -1; } +#ifndef HOST_TOOL if (!latinime::register_BinaryDictionary(env)) { AKLOGE("ERROR: BinaryDictionary native registration failed"); return -1; @@ -50,6 +54,11 @@ jint JNI_OnLoad(JavaVM *vm, void *reserved) { AKLOGE("ERROR: ProximityInfo native registration failed"); return -1; } +#endif + if (!latinime::register_Ver3DictDecoder(env)) { + AKLOGE("ERROR: Ver3DictDecoder native registration failed"); + return -1; + } /* success -- return valid version number */ return JNI_VERSION_1_6; } diff --git a/native/jni/src/suggest/core/dicnode/dic_node_proximity_filter.h b/native/jni/src/suggest/core/dicnode/dic_node_proximity_filter.h deleted file mode 100644 index 1a39f2ef3..000000000 --- a/native/jni/src/suggest/core/dicnode/dic_node_proximity_filter.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DIC_NODE_PROXIMITY_FILTER_H -#define LATINIME_DIC_NODE_PROXIMITY_FILTER_H - -#include "defines.h" -#include "suggest/core/layout/proximity_info_state.h" -#include "suggest/core/layout/proximity_info_utils.h" -#include "suggest/core/policy/dictionary_structure_policy.h" - -namespace latinime { - -class DicNodeProximityFilter : public DictionaryStructurePolicy::NodeFilter { - public: - DicNodeProximityFilter(const ProximityInfoState *const pInfoState, - const int pointIndex, const bool exactOnly) - : mProximityInfoState(pInfoState), mPointIndex(pointIndex), mExactOnly(exactOnly) {} - - bool isFilteredOut(const int codePoint) const { - return !isProximityCodePoint(codePoint); - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeProximityFilter); - - const ProximityInfoState *const mProximityInfoState; - const int mPointIndex; - const bool mExactOnly; - - // TODO: Move to proximity info state - bool isProximityCodePoint(const int codePoint) const { - if (!mProximityInfoState) { - return true; - } - if (mExactOnly) { - return mProximityInfoState->getPrimaryCodePointAt(mPointIndex) == codePoint; - } - const ProximityType matchedId = mProximityInfoState->getProximityType( - mPointIndex, codePoint, true /* checkProximityChars */); - return ProximityInfoUtils::isMatchOrProximityChar(matchedId); - } -}; -} // namespace latinime -#endif // LATINIME_DIC_NODE_PROXIMITY_FILTER_H diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp index 6b4ef2fea..bb54e608e 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.cpp @@ -19,12 +19,10 @@ #include <cstring> #include "suggest/core/dicnode/dic_node.h" -#include "suggest/core/dicnode/dic_node_proximity_filter.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/multi_bigram_map.h" #include "suggest/core/dictionary/probability_utils.h" -#include "suggest/core/policy/dictionary_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/char_utils.h" namespace latinime { @@ -33,17 +31,17 @@ namespace latinime { // Node initialization utils // /////////////////////////////// -/* static */ void DicNodeUtils::initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, +/* static */ void DicNodeUtils::initAsRoot( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const int prevWordNodePos, DicNode *const newRootNode) { - newRootNode->initAsRoot(binaryDictionaryInfo->getStructurePolicy()->getRootPosition(), - prevWordNodePos); + newRootNode->initAsRoot(dictionaryStructurePolicy->getRootPosition(), prevWordNodePos); } /*static */ void DicNodeUtils::initAsRootWithPreviousWord( - const BinaryDictionaryInfo *const binaryDictionaryInfo, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNode *const prevWordLastNode, DicNode *const newRootNode) { newRootNode->initAsRootWithPreviousWord( - prevWordLastNode, binaryDictionaryInfo->getStructurePolicy()->getRootPosition()); + prevWordLastNode, dictionaryStructurePolicy->getRootPosition()); } /* static */ void DicNodeUtils::initByCopy(DicNode *srcNode, DicNode *destNode) { @@ -53,37 +51,16 @@ namespace latinime { /////////////////////////////////// // Traverse node expansion utils // /////////////////////////////////// - -/* static */ void DicNodeUtils::createAndGetPassingChildNode(DicNode *dicNode, - const DicNodeProximityFilter *const childrenFilter, - DicNodeVector *childDicNodes) { - // Passing multiple chars node. No need to traverse child - const int codePoint = dicNode->getNodeTypedCodePoint(); - const int baseLowerCaseCodePoint = CharUtils::toBaseLowerCase(codePoint); - if (!childrenFilter->isFilteredOut(codePoint) - || CharUtils::isIntentionalOmissionCodePoint(baseLowerCaseCodePoint)) { - childDicNodes->pushPassingChild(dicNode); - } -} - /* static */ void DicNodeUtils::getAllChildDicNodes(DicNode *dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes) { - getProximityChildDicNodes(dicNode, binaryDictionaryInfo, 0, 0, false, childDicNodes); -} - -/* static */ void DicNodeUtils::getProximityChildDicNodes(DicNode *dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *childDicNodes) { if (dicNode->isTotalInputSizeExceedingLimit()) { return; } - const DicNodeProximityFilter childrenFilter(pInfoState, pointIndex, exactOnly); if (!dicNode->isLeavingNode()) { - DicNodeUtils::createAndGetPassingChildNode(dicNode, &childrenFilter, childDicNodes); + childDicNodes->pushPassingChild(dicNode); } else { - binaryDictionaryInfo->getStructurePolicy()->createAndGetAllChildNodes(dicNode, - binaryDictionaryInfo, &childrenFilter, childDicNodes); + dictionaryStructurePolicy->createAndGetAllChildNodes(dicNode, childDicNodes); } } @@ -94,12 +71,13 @@ namespace latinime { * Computes the combined bigram / unigram cost for the given dicNode. */ /* static */ float DicNodeUtils::getBigramNodeImprobability( - const BinaryDictionaryInfo *const binaryDictionaryInfo, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DicNode *const node, MultiBigramMap *multiBigramMap) { if (node->hasMultipleWords() && !node->isValidMultipleWordSuggestion()) { return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } - const int probability = getBigramNodeProbability(binaryDictionaryInfo, node, multiBigramMap); + const int probability = getBigramNodeProbability(dictionaryStructurePolicy, node, + multiBigramMap); // TODO: This equation to calculate the improbability looks unreasonable. Investigate this. const float cost = static_cast<float>(MAX_PROBABILITY - probability) / static_cast<float>(MAX_PROBABILITY); @@ -107,7 +85,7 @@ namespace latinime { } /* static */ int DicNodeUtils::getBigramNodeProbability( - const BinaryDictionaryInfo *const binaryDictionaryInfo, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DicNode *const node, MultiBigramMap *multiBigramMap) { const int unigramProbability = node->getProbability(); const int wordPos = node->getPos(); @@ -118,8 +96,8 @@ namespace latinime { return ProbabilityUtils::backoff(unigramProbability); } if (multiBigramMap) { - return multiBigramMap->getBigramProbability( - binaryDictionaryInfo, prevWordPos, wordPos, unigramProbability); + return multiBigramMap->getBigramProbability(dictionaryStructurePolicy, prevWordPos, + wordPos, unigramProbability); } return ProbabilityUtils::backoff(unigramProbability); } diff --git a/native/jni/src/suggest/core/dicnode/dic_node_utils.h b/native/jni/src/suggest/core/dicnode/dic_node_utils.h index 4f12b29f4..3fb351a61 100644 --- a/native/jni/src/suggest/core/dicnode/dic_node_utils.h +++ b/native/jni/src/suggest/core/dicnode/dic_node_utils.h @@ -23,41 +23,37 @@ namespace latinime { -class BinaryDictionaryInfo; class DicNode; -class DicNodeProximityFilter; class DicNodeVector; -class ProximityInfoState; +class DictionaryStructureWithBufferPolicy; class MultiBigramMap; class DicNodeUtils { public: static int appendTwoWords(const int *src0, const int16_t length0, const int *src1, const int16_t length1, int *dest); - static void initAsRoot(const BinaryDictionaryInfo *const binaryDictionaryInfo, + static void initAsRoot( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const int prevWordNodePos, DicNode *newRootNode); - static void initAsRootWithPreviousWord(const BinaryDictionaryInfo *const binaryDictionaryInfo, + static void initAsRootWithPreviousWord( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNode *prevWordLastNode, DicNode *newRootNode); static void initByCopy(DicNode *srcNode, DicNode *destNode); static void getAllChildDicNodes(DicNode *dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, DicNodeVector *childDicNodes); - static float getBigramNodeImprobability(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const DicNode *const node, MultiBigramMap *const multiBigramMap); - // TODO: Move to private - static void getProximityChildDicNodes(DicNode *dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const ProximityInfoState *pInfoState, const int pointIndex, bool exactOnly, + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, DicNodeVector *childDicNodes); + static float getBigramNodeImprobability( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, + const DicNode *const node, MultiBigramMap *const multiBigramMap); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DicNodeUtils); // Max number of bigrams to look up static const int MAX_BIGRAMS_CONSIDERED_PER_CONTEXT = 500; - static int getBigramNodeProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, + static int getBigramNodeProbability( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy, const DicNode *const node, MultiBigramMap *multiBigramMap); - static void createAndGetPassingChildNode(DicNode *dicNode, - const DicNodeProximityFilter *const childrenFilter, DicNodeVector *childDicNodes); }; } // namespace latinime #endif // LATINIME_DIC_NODE_UTILS_H diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h index 45c7f5cf9..74eb5dfe7 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_output.h @@ -49,8 +49,10 @@ class DicNodeStateOutput { void addMergedNodeCodePoints(const uint16_t mergedNodeCodePointCount, const int *const mergedNodeCodePoints) { if (mergedNodeCodePoints) { + const int additionalCodePointCount = min(static_cast<int>(mergedNodeCodePointCount), + MAX_WORD_LENGTH - mOutputtedCodePointCount); memcpy(&mCodePointsBuf[mOutputtedCodePointCount], mergedNodeCodePoints, - mergedNodeCodePointCount * sizeof(mCodePointsBuf[0])); + additionalCodePointCount * sizeof(mCodePointsBuf[0])); mOutputtedCodePointCount = static_cast<uint16_t>( mOutputtedCodePointCount + mergedNodeCodePointCount); if (mOutputtedCodePointCount < MAX_WORD_LENGTH) { diff --git a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h index 5854f4f6e..f437c95f6 100644 --- a/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h +++ b/native/jni/src/suggest/core/dicnode/internal/dic_node_state_prevword.h @@ -69,11 +69,14 @@ class DicNodeStatePrevWord { const int prevWordNodePos, const int *const src0, const int16_t length0, const int *const src1, const int16_t length1, const int *const prevSpacePositions, const int lastInputIndex) { - mPrevWordCount = prevWordCount; + mPrevWordCount = min(prevWordCount, static_cast<int16_t>(MAX_RESULTS)); mPrevWordProbability = prevWordProbability; mPrevWordNodePos = prevWordNodePos; - const int twoWordsLen = + int twoWordsLen = DicNodeUtils::appendTwoWords(src0, length0, src1, length1, mPrevWord); + if (twoWordsLen >= MAX_WORD_LENGTH) { + twoWordsLen = MAX_WORD_LENGTH - 1; + } mPrevWord[twoWordsLen] = KEYCODE_SPACE; mPrevWordStart = length0; mPrevWordLength = static_cast<int16_t>(twoWordsLen + 1); diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index 3751ae500..e74a1dbc8 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -22,15 +22,16 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/probability_utils.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/char_utils.h" namespace latinime { -BigramDictionary::BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo) - : mBinaryDictionaryInfo(binaryDictionaryInfo) { +BigramDictionary::BigramDictionary( + const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy) + : mDictionaryStructurePolicy(dictionaryStructurePolicy) { if (DEBUG_DICT) { AKLOGI("BigramDictionary - constructor"); } @@ -112,13 +113,19 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng int bigramCount = 0; int unigramProbability = 0; int bigramBuffer[MAX_WORD_LENGTH]; - BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); + BinaryDictionaryBigramsIterator bigramsIt( + mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); while (bigramsIt.hasNext()) { bigramsIt.next(); - const int length = mBinaryDictionaryInfo->getStructurePolicy()-> - getCodePointsAndProbabilityAndReturnCodePointCount( - mBinaryDictionaryInfo, bigramsIt.getBigramPos(), MAX_WORD_LENGTH, - bigramBuffer, &unigramProbability); + if (bigramsIt.getBigramPos() == NOT_A_VALID_WORD_POS) { + continue; + } + const int codePointCount = mDictionaryStructurePolicy-> + getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(), + MAX_WORD_LENGTH, bigramBuffer, &unigramProbability); + if (codePointCount <= 0) { + continue; + } // Due to space constraints, the probability for bigrams is approximate - the lower the // unigram probability, the worse the precision. The theoritical maximum error in // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 @@ -126,8 +133,8 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng // here, but it can't get too bad. const int probability = ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramsIt.getProbability()); - addWordBigram(bigramBuffer, length, probability, outBigramProbability, outBigramCodePoints, - outputTypes); + addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability, + outBigramCodePoints, outputTypes); ++bigramCount; } return min(bigramCount, MAX_RESULTS); @@ -138,11 +145,10 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const { if (0 >= prevWordLength) return NOT_A_DICT_POS; - int pos = mBinaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( - mBinaryDictionaryInfo, prevWord, prevWordLength, forceLowerCaseSearch); + int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength, + forceLowerCaseSearch); if (NOT_A_VALID_WORD_POS == pos) return NOT_A_DICT_POS; - return mBinaryDictionaryInfo->getStructurePolicy()->getBigramsPositionOfNode( - mBinaryDictionaryInfo, pos); + return mDictionaryStructurePolicy->getBigramsPositionOfNode(pos); } bool BigramDictionary::isValidBigram(const int *word0, int length0, const int *word1, @@ -150,11 +156,12 @@ bool BigramDictionary::isValidBigram(const int *word0, int length0, const int *w int pos = getBigramListPositionForWord(word0, length0, false /* forceLowerCaseSearch */); // getBigramListPositionForWord returns 0 if this word isn't in the dictionary or has no bigrams if (NOT_A_DICT_POS == pos) return false; - int nextWordPos = mBinaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( - mBinaryDictionaryInfo, word1, length1, false /* forceLowerCaseSearch */); + int nextWordPos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); if (NOT_A_VALID_WORD_POS == nextWordPos) return false; - BinaryDictionaryBigramsIterator bigramsIt(mBinaryDictionaryInfo, pos); + BinaryDictionaryBigramsIterator bigramsIt( + mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); while (bigramsIt.hasNext()) { bigramsIt.next(); if (bigramsIt.getBigramPos() == nextWordPos) { diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h index 438c34cac..99b964c49 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h @@ -21,11 +21,11 @@ namespace latinime { -class BinaryDictionaryInfo; +class DictionaryStructureWithBufferPolicy; class BigramDictionary { public: - BigramDictionary(const BinaryDictionaryInfo *const binaryDictionaryInfo); + BigramDictionary(const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy); int getPredictions(const int *word, int length, int *outBigramCodePoints, int *outBigramProbability, int *outputTypes) const; @@ -40,7 +40,7 @@ class BigramDictionary { int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, const bool forceLowerCaseSearch) const; - const BinaryDictionaryInfo *const mBinaryDictionaryInfo; + const DictionaryStructureWithBufferPolicy *const mDictionaryStructurePolicy; }; } // namespace latinime #endif // LATINIME_BIGRAM_DICTIONARY_H diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h index 8cbb12998..d16ac47fe 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_bigrams_iterator.h @@ -18,51 +18,41 @@ #define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H #include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" namespace latinime { class BinaryDictionaryBigramsIterator { public: BinaryDictionaryBigramsIterator( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int pos) - : mBinaryDictionaryInfo(binaryDictionaryInfo), mPos(pos), mBigramFlags(0), - mBigramPos(NOT_A_DICT_POS), mHasNext(pos != NOT_A_DICT_POS) {} + const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos) + : mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mHasNext(pos != NOT_A_DICT_POS) {} AK_FORCE_INLINE bool hasNext() const { return mHasNext; } AK_FORCE_INLINE void next() { - mBigramFlags = BinaryDictionaryTerminalAttributesReadingUtils::getFlagsAndForwardPointer( - mBinaryDictionaryInfo, &mPos); - mBigramPos = - BinaryDictionaryTerminalAttributesReadingUtils::getBigramAddressAndForwardPointer( - mBinaryDictionaryInfo, mBigramFlags, &mPos); - mHasNext = BinaryDictionaryTerminalAttributesReadingUtils::hasNext(mBigramFlags); + mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos); } AK_FORCE_INLINE int getProbability() const { - return BinaryDictionaryTerminalAttributesReadingUtils::getProbabilityFromFlags( - mBigramFlags); + return mProbability; } AK_FORCE_INLINE int getBigramPos() const { return mBigramPos; } - AK_FORCE_INLINE int getFlags() const { - return mBigramFlags; - } - private: DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator); - const BinaryDictionaryInfo *const mBinaryDictionaryInfo; + const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy; int mPos; - BinaryDictionaryTerminalAttributesReadingUtils::BigramFlags mBigramFlags; int mBigramPos; + int mProbability; bool mHasNext; }; } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp deleted file mode 100644 index 91c643a5f..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/dictionary/binary_dictionary_header.h" - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" - -namespace latinime { - -const char *const BinaryDictionaryHeader::MULTIPLE_WORDS_DEMOTION_RATE_KEY = - "MULTIPLE_WORDS_DEMOTION_RATE"; -const float BinaryDictionaryHeader::DEFAULT_MULTI_WORD_COST_MULTIPLIER = 1.0f; -const float BinaryDictionaryHeader::MULTI_WORD_COST_MULTIPLIER_SCALE = 100.0f; - -BinaryDictionaryHeader::BinaryDictionaryHeader( - const BinaryDictionaryInfo *const binaryDictionaryInfo) - : mBinaryDictionaryInfo(binaryDictionaryInfo), - mDictionaryFlags(BinaryDictionaryHeaderReadingUtils::getFlags(binaryDictionaryInfo)), - mSize(BinaryDictionaryHeaderReadingUtils::getHeaderSize(binaryDictionaryInfo)), - mMultiWordCostMultiplier(readMultiWordCostMultiplier()) {} - -float BinaryDictionaryHeader::readMultiWordCostMultiplier() const { - const int headerValue = BinaryDictionaryHeaderReadingUtils::readHeaderValueInt( - mBinaryDictionaryInfo, MULTIPLE_WORDS_DEMOTION_RATE_KEY); - if (headerValue == S_INT_MIN) { - // not found - return DEFAULT_MULTI_WORD_COST_MULTIPLIER; - } - if (headerValue <= 0) { - return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); - } - return MULTI_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(headerValue); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp deleted file mode 100644 index a57b0f859..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/dictionary/binary_dictionary_header_reading_utils.h" - -#include <cctype> -#include <cstdlib> - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" - -namespace latinime { - -const int BinaryDictionaryHeaderReadingUtils::MAX_OPTION_KEY_LENGTH = 256; - -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_MAGIC_NUMBER_SIZE = 4; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_DICTIONARY_VERSION_SIZE = 2; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_FLAG_SIZE = 2; -const int BinaryDictionaryHeaderReadingUtils::VERSION_2_HEADER_SIZE_FIELD_SIZE = 4; - -const BinaryDictionaryHeaderReadingUtils::DictionaryFlags - BinaryDictionaryHeaderReadingUtils::NO_FLAGS = 0; -// Flags for special processing -// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or -// something very bad (like, the apocalypse) will happen. Please update both at the same time. -const BinaryDictionaryHeaderReadingUtils::DictionaryFlags - BinaryDictionaryHeaderReadingUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; -const BinaryDictionaryHeaderReadingUtils::DictionaryFlags - BinaryDictionaryHeaderReadingUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; -const BinaryDictionaryHeaderReadingUtils::DictionaryFlags - BinaryDictionaryHeaderReadingUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; - -/* static */ int BinaryDictionaryHeaderReadingUtils::getHeaderSize( - const BinaryDictionaryInfo *const binaryDictionaryInfo) { - switch (getHeaderVersion(binaryDictionaryInfo->getFormat())) { - case HEADER_VERSION_2: - // See the format of the header in the comment in - // BinaryDictionaryFormatUtils::detectFormatVersion() - return ByteArrayUtils::readUint32(binaryDictionaryInfo->getDictBuf(), - VERSION_2_HEADER_MAGIC_NUMBER_SIZE + VERSION_2_HEADER_DICTIONARY_VERSION_SIZE - + VERSION_2_HEADER_FLAG_SIZE); - default: - return S_INT_MAX; - } -} - -/* static */ BinaryDictionaryHeaderReadingUtils::DictionaryFlags - BinaryDictionaryHeaderReadingUtils::getFlags( - const BinaryDictionaryInfo *const binaryDictionaryInfo) { - switch (getHeaderVersion(binaryDictionaryInfo->getFormat())) { - case HEADER_VERSION_2: - return ByteArrayUtils::readUint16(binaryDictionaryInfo->getDictBuf(), - VERSION_2_HEADER_MAGIC_NUMBER_SIZE + VERSION_2_HEADER_DICTIONARY_VERSION_SIZE); - default: - return NO_FLAGS; - } -} - -// Returns if the key is found or not and reads the found value into outValue. -/* static */ bool BinaryDictionaryHeaderReadingUtils::readHeaderValue( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const char *const key, int *outValue, const int outValueSize) { - if (outValueSize <= 0) { - return false; - } - const int headerSize = getHeaderSize(binaryDictionaryInfo); - int pos = getHeaderOptionsPosition(binaryDictionaryInfo->getFormat()); - if (pos == NOT_A_DICT_POS) { - // The header doesn't have header options. - return false; - } - while (pos < headerSize) { - if(ByteArrayUtils::compareStringInBufferWithCharArray( - binaryDictionaryInfo->getDictBuf(), key, headerSize - pos, &pos) == 0) { - // The key was found. - const int length = ByteArrayUtils::readStringAndAdvancePosition( - binaryDictionaryInfo->getDictBuf(), outValueSize, outValue, &pos); - // Add a 0 terminator to the string. - outValue[length < outValueSize ? length : outValueSize - 1] = '\0'; - return true; - } - ByteArrayUtils::advancePositionToBehindString( - binaryDictionaryInfo->getDictBuf(), headerSize - pos, &pos); - } - // The key was not found. - return false; -} - -/* static */ int BinaryDictionaryHeaderReadingUtils::readHeaderValueInt( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const char *const key) { - const int bufferSize = LARGEST_INT_DIGIT_COUNT; - int intBuffer[bufferSize]; - char charBuffer[bufferSize]; - if (!readHeaderValue(binaryDictionaryInfo, key, intBuffer, bufferSize)) { - return S_INT_MIN; - } - for (int i = 0; i < bufferSize; ++i) { - charBuffer[i] = intBuffer[i]; - if (charBuffer[i] == '0') { - break; - } - if (!isdigit(charBuffer[i])) { - // If not a number, return S_INT_MIN - return S_INT_MIN; - } - } - return atoi(charBuffer); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h deleted file mode 100644 index 61748227e..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header_reading_utils.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_HEADER_READING_UTILS_H -#define LATINIME_DICTIONARY_HEADER_READING_UTILS_H - -#include <stdint.h> - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_format_utils.h" - -namespace latinime { - -class BinaryDictionaryInfo; - -class BinaryDictionaryHeaderReadingUtils { - public: - typedef uint16_t DictionaryFlags; - - static const int MAX_OPTION_KEY_LENGTH; - - static int getHeaderSize(const BinaryDictionaryInfo *const binaryDictionaryInfo); - - static DictionaryFlags getFlags(const BinaryDictionaryInfo *const binaryDictionaryInfo); - - static AK_FORCE_INLINE bool supportsDynamicUpdate(const DictionaryFlags flags) { - return (flags & SUPPORTS_DYNAMIC_UPDATE_FLAG) != 0; - } - - static AK_FORCE_INLINE bool requiresGermanUmlautProcessing(const DictionaryFlags flags) { - return (flags & GERMAN_UMLAUT_PROCESSING_FLAG) != 0; - } - - static AK_FORCE_INLINE bool requiresFrenchLigatureProcessing(const DictionaryFlags flags) { - return (flags & FRENCH_LIGATURE_PROCESSING_FLAG) != 0; - } - - static AK_FORCE_INLINE int getHeaderOptionsPosition( - const BinaryDictionaryFormatUtils::FORMAT_VERSION dictionaryFormat) { - switch (getHeaderVersion(dictionaryFormat)) { - case HEADER_VERSION_2: - return VERSION_2_HEADER_MAGIC_NUMBER_SIZE + VERSION_2_HEADER_DICTIONARY_VERSION_SIZE - + VERSION_2_HEADER_FLAG_SIZE + VERSION_2_HEADER_SIZE_FIELD_SIZE; - break; - default: - return NOT_A_DICT_POS; - } - } - - static bool readHeaderValue( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const char *const key, int *outValue, const int outValueSize); - - static int readHeaderValueInt( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const char *const key); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryHeaderReadingUtils); - - enum HEADER_VERSION { - HEADER_VERSION_2, - UNKNOWN_HEADER_VERSION - }; - - static const int VERSION_2_HEADER_MAGIC_NUMBER_SIZE; - static const int VERSION_2_HEADER_DICTIONARY_VERSION_SIZE; - static const int VERSION_2_HEADER_FLAG_SIZE; - static const int VERSION_2_HEADER_SIZE_FIELD_SIZE; - - static const DictionaryFlags NO_FLAGS; - // Flags for special processing - // Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAGS) or - // something very bad (like, the apocalypse) will happen. Please update both at the same time. - static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG; - static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG; - static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG; - static const DictionaryFlags CONTAINS_BIGRAMS_FLAG; - - static HEADER_VERSION getHeaderVersion( - const BinaryDictionaryFormatUtils::FORMAT_VERSION formatVersion) { - switch(formatVersion) { - case BinaryDictionaryFormatUtils::VERSION_2: - // Fall through - case BinaryDictionaryFormatUtils::VERSION_3: - return HEADER_VERSION_2; - default: - return UNKNOWN_HEADER_VERSION; - } - } -}; -} -#endif /* LATINIME_DICTIONARY_HEADER_READING_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h deleted file mode 100644 index cbea18f90..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_info.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BINARY_DICTIONARY_INFO_H -#define LATINIME_BINARY_DICTIONARY_INFO_H - -#include <stdint.h> - -#include "defines.h" -#include "jni.h" -#include "suggest/core/dictionary/binary_dictionary_format_utils.h" -#include "suggest/core/dictionary/binary_dictionary_header.h" -#include "suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h" -#include "utils/log_utils.h" - -namespace latinime { - -class BinaryDictionaryInfo { - public: - AK_FORCE_INLINE BinaryDictionaryInfo(JNIEnv *env, const uint8_t *const dictBuf, - const int dictSize, const int mmapFd, const int dictBufOffset, const bool isUpdatable) - : mDictBuf(dictBuf), mDictSize(dictSize), mMmapFd(mmapFd), - mDictBufOffset(dictBufOffset), mIsUpdatable(isUpdatable), - mDictionaryFormat(BinaryDictionaryFormatUtils::detectFormatVersion( - mDictBuf, mDictSize)), - mDictionaryHeader(this), mDictRoot(mDictBuf + mDictionaryHeader.getSize()), - mStructurePolicy(DictionaryStructurePolicyFactory::getDictionaryStructurePolicy( - mDictionaryFormat)) { - logDictionaryInfo(env); - } - - AK_FORCE_INLINE const uint8_t *getDictBuf() const { - return mDictBuf; - } - - AK_FORCE_INLINE int getDictSize() const { - return mDictSize; - } - - AK_FORCE_INLINE int getMmapFd() const { - return mMmapFd; - } - - AK_FORCE_INLINE int getDictBufOffset() const { - return mDictBufOffset; - } - - AK_FORCE_INLINE const uint8_t *getDictRoot() const { - return mDictRoot; - } - - AK_FORCE_INLINE BinaryDictionaryFormatUtils::FORMAT_VERSION getFormat() const { - return mDictionaryFormat; - } - - AK_FORCE_INLINE const BinaryDictionaryHeader *getHeader() const { - return &mDictionaryHeader; - } - - AK_FORCE_INLINE bool isDynamicallyUpdatable() const { - // TODO: Support dynamic dictionary formats. - const bool isUpdatableDictionaryFormat = false; - return mIsUpdatable && isUpdatableDictionaryFormat; - } - - AK_FORCE_INLINE const DictionaryStructurePolicy *getStructurePolicy() const { - return mStructurePolicy; - } - - private: - DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryInfo); - - const uint8_t *const mDictBuf; - const int mDictSize; - const int mMmapFd; - const int mDictBufOffset; - const bool mIsUpdatable; - const BinaryDictionaryFormatUtils::FORMAT_VERSION mDictionaryFormat; - const BinaryDictionaryHeader mDictionaryHeader; - const uint8_t *const mDictRoot; - const DictionaryStructurePolicy *const mStructurePolicy; - - AK_FORCE_INLINE void logDictionaryInfo(JNIEnv *const env) const { - const int BUFFER_SIZE = 16; - int dictionaryIdCodePointBuffer[BUFFER_SIZE]; - int versionStringCodePointBuffer[BUFFER_SIZE]; - int dateStringCodePointBuffer[BUFFER_SIZE]; - mDictionaryHeader.readHeaderValueOrQuestionMark("dictionary", - dictionaryIdCodePointBuffer, BUFFER_SIZE); - mDictionaryHeader.readHeaderValueOrQuestionMark("version", - versionStringCodePointBuffer, BUFFER_SIZE); - mDictionaryHeader.readHeaderValueOrQuestionMark("date", - dateStringCodePointBuffer, BUFFER_SIZE); - - char dictionaryIdCharBuffer[BUFFER_SIZE]; - char versionStringCharBuffer[BUFFER_SIZE]; - char dateStringCharBuffer[BUFFER_SIZE]; - intArrayToCharArray(dictionaryIdCodePointBuffer, BUFFER_SIZE, - dictionaryIdCharBuffer, BUFFER_SIZE); - intArrayToCharArray(versionStringCodePointBuffer, BUFFER_SIZE, - versionStringCharBuffer, BUFFER_SIZE); - intArrayToCharArray(dateStringCodePointBuffer, BUFFER_SIZE, - dateStringCharBuffer, BUFFER_SIZE); - - LogUtils::logToJava(env, - "Dictionary info: dictionary = %s ; version = %s ; date = %s ; filesize = %i", - dictionaryIdCharBuffer, versionStringCharBuffer, dateStringCharBuffer, mDictSize); - } -}; -} -#endif /* LATINIME_BINARY_DICTIONARY_INFO_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h new file mode 100644 index 000000000..558e0a5c3 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/binary_dictionary_shortcut_iterator.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" + +namespace latinime { + +class BinaryDictionaryShortcutIterator { + public: + BinaryDictionaryShortcutIterator( + const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy, + const int shortcutPos) + : mShortcutStructurePolicy(shortcutStructurePolicy), + mPos(shortcutStructurePolicy->getStartPos(shortcutPos)), + mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {} + + AK_FORCE_INLINE bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as an int string and put it to outTarget, put its length + // to outTargetLength, put whether it is whitelist to outIsWhitelist. + AK_FORCE_INLINE void nextShortcutTarget( + const int maxDepth, int *const outTarget, int *const outTargetLength, + bool *const outIsWhitelist) { + mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength, + outIsWhitelist, &mHasNextShortcutTarget, &mPos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryShortcutIterator); + + const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy; + int mPos; + bool mHasNextShortcutTarget; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.cpp b/native/jni/src/suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.cpp deleted file mode 100644 index 20b77b3b2..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.cpp +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h" - -#include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/byte_array_utils.h" - -namespace latinime { - -typedef BinaryDictionaryTerminalAttributesReadingUtils TaUtils; - -const TaUtils::TerminalAttributeFlags TaUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; -const TaUtils::TerminalAttributeFlags TaUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; -const TaUtils::TerminalAttributeFlags TaUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; -const TaUtils::TerminalAttributeFlags TaUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; -const TaUtils::TerminalAttributeFlags TaUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; -// Flag for presence of more attributes -const TaUtils::TerminalAttributeFlags TaUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; -// Mask for attribute probability, stored on 4 bits inside the flags byte. -const TaUtils::TerminalAttributeFlags TaUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; -const int TaUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; -const int TaUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; -// The numeric value of the shortcut probability that means 'whitelist'. -const int TaUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; - -/* static */ int TaUtils::getBigramAddressAndForwardPointer( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const TerminalAttributeFlags flags, - int *const pos) { - int offset = 0; - const int origin = *pos; - switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: - offset = ByteArrayUtils::readUint8AndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), pos); - break; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: - offset = ByteArrayUtils::readUint16AndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), pos); - break; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: - offset = ByteArrayUtils::readUint24AndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), pos); - break; - } - if (isOffsetNegative(flags)) { - return origin - offset; - } else { - return origin + offset; - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h b/native/jni/src/suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h deleted file mode 100644 index 375fc7dff..000000000 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BINARY_DICTIONARY_TERMINAL_ATTRIBUTES_READING_UTILS_H -#define LATINIME_BINARY_DICTIONARY_TERMINAL_ATTRIBUTES_READING_UTILS_H - -#include <stdint.h> - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/byte_array_utils.h" - -namespace latinime { - -class BinaryDictionaryTerminalAttributesReadingUtils { - public: - typedef uint8_t TerminalAttributeFlags; - typedef TerminalAttributeFlags BigramFlags; - typedef TerminalAttributeFlags ShortcutFlags; - - static AK_FORCE_INLINE TerminalAttributeFlags getFlagsAndForwardPointer( - const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) { - return ByteArrayUtils::readUint8AndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), pos); - } - - static AK_FORCE_INLINE int getProbabilityFromFlags(const TerminalAttributeFlags flags) { - return flags & MASK_ATTRIBUTE_PROBABILITY; - } - - static AK_FORCE_INLINE bool hasNext(const TerminalAttributeFlags flags) { - return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; - } - - // Bigrams reading methods - static AK_FORCE_INLINE void skipExistingBigrams( - const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) { - BigramFlags flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos); - while (hasNext(flags)) { - *pos += attributeAddressSize(flags); - flags = getFlagsAndForwardPointer(binaryDictionaryInfo, pos); - } - *pos += attributeAddressSize(flags); - } - - static int getBigramAddressAndForwardPointer( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const BigramFlags flags, - int *const pos); - - // Shortcuts reading methods - // This method returns the size of the shortcut list region excluding the shortcut list size - // field at the beginning. - static AK_FORCE_INLINE int getShortcutListSizeAndForwardPointer( - const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) { - // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself. - return ByteArrayUtils::readUint16AndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), pos) - SHORTCUT_LIST_SIZE_FIELD_SIZE; - } - - static AK_FORCE_INLINE void skipShortcuts( - const BinaryDictionaryInfo *const binaryDictionaryInfo, int *const pos) { - const int shortcutListSize = getShortcutListSizeAndForwardPointer( - binaryDictionaryInfo, pos); - *pos += shortcutListSize; - } - - static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) { - return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; - } - - static AK_FORCE_INLINE int readShortcutTarget( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int maxLength, - int *const outWord, int *const pos) { - return ByteArrayUtils::readStringAndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), maxLength, outWord, pos); - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryTerminalAttributesReadingUtils); - - static const TerminalAttributeFlags MASK_ATTRIBUTE_ADDRESS_TYPE; - static const TerminalAttributeFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; - static const TerminalAttributeFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; - static const TerminalAttributeFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; - static const TerminalAttributeFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; - static const TerminalAttributeFlags FLAG_ATTRIBUTE_HAS_NEXT; - static const TerminalAttributeFlags MASK_ATTRIBUTE_PROBABILITY; - static const int ATTRIBUTE_ADDRESS_SHIFT; - static const int SHORTCUT_LIST_SIZE_FIELD_SIZE; - static const int WHITELIST_SHORTCUT_PROBABILITY; - - static AK_FORCE_INLINE bool isOffsetNegative(const TerminalAttributeFlags flags) { - return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; - } - - static AK_FORCE_INLINE int attributeAddressSize(const TerminalAttributeFlags flags) { - return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; - /* Note: this is a value-dependant optimization of what may probably be - more readably written this way: - switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; - default: return 0; - } - */ - } -}; -} -#endif /* LATINIME_BINARY_DICTIONARY_TERMINAL_ATTRIBUTES_READING_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 4a9e38fe8..8418a608a 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -18,33 +18,35 @@ #include "suggest/core/dictionary/dictionary.h" -#include <map> // TODO: remove #include <stdint.h> #include "defines.h" -#include "jni.h" #include "suggest/core/dictionary/bigram_dictionary.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/session/dic_traverse_session.h" #include "suggest/core/suggest.h" #include "suggest/core/suggest_options.h" #include "suggest/policyimpl/gesture/gesture_suggest_policy_factory.h" #include "suggest/policyimpl/typing/typing_suggest_policy_factory.h" +#include "utils/log_utils.h" namespace latinime { -Dictionary::Dictionary(JNIEnv *env, void *dict, int dictSize, int mmapFd, - int dictBufOffset, bool isUpdatable) - : mBinaryDictionaryInfo(env, static_cast<const uint8_t *>(dict), dictSize, mmapFd, - dictBufOffset, isUpdatable), - mBigramDictionary(new BigramDictionary(&mBinaryDictionaryInfo)), +Dictionary::Dictionary(JNIEnv *env, + DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPolicy) + : mDictionaryStructureWithBufferPolicy(dictionaryStructureWithBufferPolicy), + mBigramDictionary(new BigramDictionary(mDictionaryStructureWithBufferPolicy)), mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())), mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) { + logDictionaryInfo(env); } Dictionary::~Dictionary() { delete mBigramDictionary; delete mGestureSuggest; delete mTypingSuggest; + delete mDictionaryStructureWithBufferPolicy; } int Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, @@ -83,14 +85,12 @@ int Dictionary::getBigrams(const int *word, int length, int *outWords, int *freq } int Dictionary::getProbability(const int *word, int length) const { - const DictionaryStructurePolicy *const structurePolicy = - mBinaryDictionaryInfo.getStructurePolicy(); - int pos = structurePolicy->getTerminalNodePositionOfWord(&mBinaryDictionaryInfo, word, length, + int pos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord(word, length, false /* forceLowerCaseSearch */); if (NOT_A_VALID_WORD_POS == pos) { return NOT_A_PROBABILITY; } - return structurePolicy->getUnigramProbability(&mBinaryDictionaryInfo, pos); + return getDictionaryStructurePolicy()->getUnigramProbability(pos); } bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, int length1) const { @@ -98,32 +98,46 @@ bool Dictionary::isValidBigram(const int *word0, int length0, const int *word1, } void Dictionary::addUnigramWord(const int *const word, const int length, const int probability) { - if (!mBinaryDictionaryInfo.isDynamicallyUpdatable()) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: Dictionary::addUnigramWord() is called for non-updatable dictionary."); - return; - } - // TODO: Support dynamic update + mDictionaryStructureWithBufferPolicy->addUnigramWord(word, length, probability); } void Dictionary::addBigramWords(const int *const word0, const int length0, const int *const word1, const int length1, const int probability) { - if (!mBinaryDictionaryInfo.isDynamicallyUpdatable()) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: Dictionary::addBigramWords() is called for non-updatable dictionary."); - return; - } - // TODO: Support dynamic update + mDictionaryStructureWithBufferPolicy->addBigramWords(word0, length0, word1, length1, + probability); } void Dictionary::removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1) { - if (!mBinaryDictionaryInfo.isDynamicallyUpdatable()) { - // This method should not be called for non-updatable dictionary. - AKLOGI("Warning: Dictionary::removeBigramWords() is called for non-updatable dictionary."); - return; - } - // TODO: Support dynamic update + mDictionaryStructureWithBufferPolicy->removeBigramWords(word0, length0, word1, length1); +} + +void Dictionary::logDictionaryInfo(JNIEnv *const env) const { + const int BUFFER_SIZE = 16; + int dictionaryIdCodePointBuffer[BUFFER_SIZE]; + int versionStringCodePointBuffer[BUFFER_SIZE]; + int dateStringCodePointBuffer[BUFFER_SIZE]; + const DictionaryHeaderStructurePolicy *const headerPolicy = + getDictionaryStructurePolicy()->getHeaderStructurePolicy(); + headerPolicy->readHeaderValueOrQuestionMark("dictionary", dictionaryIdCodePointBuffer, + BUFFER_SIZE); + headerPolicy->readHeaderValueOrQuestionMark("version", versionStringCodePointBuffer, + BUFFER_SIZE); + headerPolicy->readHeaderValueOrQuestionMark("date", dateStringCodePointBuffer, BUFFER_SIZE); + + char dictionaryIdCharBuffer[BUFFER_SIZE]; + char versionStringCharBuffer[BUFFER_SIZE]; + char dateStringCharBuffer[BUFFER_SIZE]; + intArrayToCharArray(dictionaryIdCodePointBuffer, BUFFER_SIZE, + dictionaryIdCharBuffer, BUFFER_SIZE); + intArrayToCharArray(versionStringCodePointBuffer, BUFFER_SIZE, + versionStringCharBuffer, BUFFER_SIZE); + intArrayToCharArray(dateStringCodePointBuffer, BUFFER_SIZE, + dateStringCharBuffer, BUFFER_SIZE); + + LogUtils::logToJava(env, + "Dictionary info: dictionary = %s ; version = %s ; date = %s", + dictionaryIdCharBuffer, versionStringCharBuffer, dateStringCharBuffer); } } // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 9f1e0729d..0afe5a54b 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -21,11 +21,11 @@ #include "defines.h" #include "jni.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" namespace latinime { class BigramDictionary; +class DictionaryStructureWithBufferPolicy; class DicTraverseSession; class ProximityInfo; class SuggestInterface; @@ -53,8 +53,8 @@ class Dictionary { static const int KIND_FLAG_POSSIBLY_OFFENSIVE = 0x80000000; static const int KIND_FLAG_EXACT_MATCH = 0x40000000; - Dictionary(JNIEnv *env, void *dict, int dictSize, int mmapFd, int dictBufOffset, - bool isUpdatable); + Dictionary(JNIEnv *env, + DictionaryStructureWithBufferPolicy *const dictionaryStructureWithBufferPoilcy); int getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession *traverseSession, int *xcoordinates, int *ycoordinates, int *times, int *pointerIds, int *inputCodePoints, @@ -77,8 +77,8 @@ class Dictionary { void removeBigramWords(const int *const word0, const int length0, const int *const word1, const int length1); - const BinaryDictionaryInfo *getBinaryDictionaryInfo() const { - return &mBinaryDictionaryInfo; + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { + return mDictionaryStructureWithBufferPolicy; } virtual ~Dictionary(); @@ -86,10 +86,12 @@ class Dictionary { private: DISALLOW_IMPLICIT_CONSTRUCTORS(Dictionary); - const BinaryDictionaryInfo mBinaryDictionaryInfo; - const BigramDictionary *mBigramDictionary; - SuggestInterface *mGestureSuggest; - SuggestInterface *mTypingSuggest; + DictionaryStructureWithBufferPolicy *const mDictionaryStructureWithBufferPolicy; + const BigramDictionary *const mBigramDictionary; + const SuggestInterface *const mGestureSuggest; + const SuggestInterface *const mTypingSuggest; + + void logDictionaryInfo(JNIEnv *const env) const; }; } // namespace latinime #endif // LATINIME_DICTIONARY_H diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp index af378b1b7..3271c1bfb 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.cpp +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.cpp @@ -19,7 +19,7 @@ #include <cstdlib> #include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_header.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" #include "utils/char_utils.h" namespace latinime { @@ -35,8 +35,9 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = { DIGRAPH_TYPE_GERMAN_UMLAUT, DIGRAPH_TYPE_FRENCH_LIGATURES }; /* static */ bool DigraphUtils::hasDigraphForCodePoint( - const BinaryDictionaryHeader *const header, const int compositeGlyphCodePoint) { - const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(header); + const DictionaryHeaderStructurePolicy *const headerPolicy, + const int compositeGlyphCodePoint) { + const DigraphUtils::DigraphType digraphType = getDigraphTypeForDictionary(headerPolicy); if (DigraphUtils::getDigraphForDigraphTypeAndCodePoint(digraphType, compositeGlyphCodePoint)) { return true; } @@ -45,11 +46,11 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] = // Returns the digraph type associated with the given dictionary. /* static */ DigraphUtils::DigraphType DigraphUtils::getDigraphTypeForDictionary( - const BinaryDictionaryHeader *const header) { - if (header->requiresGermanUmlautProcessing()) { + const DictionaryHeaderStructurePolicy *const headerPolicy) { + if (headerPolicy->requiresGermanUmlautProcessing()) { return DIGRAPH_TYPE_GERMAN_UMLAUT; } - if (header->requiresFrenchLigatureProcessing()) { + if (headerPolicy->requiresFrenchLigatureProcessing()) { return DIGRAPH_TYPE_FRENCH_LIGATURES; } return DIGRAPH_TYPE_NONE; diff --git a/native/jni/src/suggest/core/dictionary/digraph_utils.h b/native/jni/src/suggest/core/dictionary/digraph_utils.h index 9d74fe3a6..6ae16e390 100644 --- a/native/jni/src/suggest/core/dictionary/digraph_utils.h +++ b/native/jni/src/suggest/core/dictionary/digraph_utils.h @@ -21,7 +21,7 @@ namespace latinime { -class BinaryDictionaryHeader; +class DictionaryHeaderStructurePolicy; class DigraphUtils { public: @@ -39,14 +39,15 @@ class DigraphUtils { typedef struct { int first; int second; int compositeGlyph; } digraph_t; - static bool hasDigraphForCodePoint( - const BinaryDictionaryHeader *const header, const int compositeGlyphCodePoint); + static bool hasDigraphForCodePoint(const DictionaryHeaderStructurePolicy *const headerPolicy, + const int compositeGlyphCodePoint); static int getDigraphCodePointForIndex(const int compositeGlyphCodePoint, const DigraphCodePointIndex digraphCodePointIndex); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DigraphUtils); - static DigraphType getDigraphTypeForDictionary(const BinaryDictionaryHeader *const header); + static DigraphType getDigraphTypeForDictionary( + const DictionaryHeaderStructurePolicy *const headerPolicy); static int getAllDigraphsForDigraphTypeAndReturnSize( const DigraphType digraphType, const digraph_t **const digraphs); static const digraph_t *getDigraphForCodePoint(const int compositeGlyphCodePoint); diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index d5eafe1bf..fb4a80083 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -21,9 +21,9 @@ #include "defines.h" #include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/bloom_filter.h" #include "suggest/core/dictionary/probability_utils.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "utils/hash_map_compat.h" namespace latinime { @@ -38,7 +38,7 @@ class MultiBigramMap { // Look up the bigram probability for the given word pair from the cached bigram maps. // Also caches the bigrams if there is space remaining and they have not been cached already. - int getBigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, + int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, const int wordPosition, const int nextWordPosition, const int unigramProbability) { hash_map_compat<int, BigramMap>::const_iterator mapPosition = mBigramMaps.find(wordPosition); @@ -46,12 +46,12 @@ class MultiBigramMap { return mapPosition->second.getBigramProbability(nextWordPosition, unigramProbability); } if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { - addBigramsForWordPosition(binaryDictionaryInfo, wordPosition); + addBigramsForWordPosition(structurePolicy, wordPosition); return mBigramMaps[wordPosition].getBigramProbability( nextWordPosition, unigramProbability); } - return readBigramProbabilityFromBinaryDictionary(binaryDictionaryInfo, - wordPosition, nextWordPosition, unigramProbability); + return readBigramProbabilityFromBinaryDictionary(structurePolicy, wordPosition, + nextWordPosition, unigramProbability); } void clear() { @@ -66,12 +66,16 @@ class MultiBigramMap { BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} ~BigramMap() {} - void init(const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) { - const int bigramsListPos = binaryDictionaryInfo->getStructurePolicy()-> - getBigramsPositionOfNode(binaryDictionaryInfo, nodePos); - BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos); + void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nodePos) { + const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos); + BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), + bigramsListPos); while (bigramsIt.hasNext()) { bigramsIt.next(); + if (bigramsIt.getBigramPos() == NOT_A_VALID_WORD_POS) { + continue; + } mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); mBloomFilter.setInFilter(bigramsIt.getBigramPos()); } @@ -100,16 +104,16 @@ class MultiBigramMap { }; AK_FORCE_INLINE void addBigramsForWordPosition( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int position) { - mBigramMaps[position].init(binaryDictionaryInfo, position); + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int position) { + mBigramMaps[position].init(structurePolicy, position); } AK_FORCE_INLINE int readBigramProbabilityFromBinaryDictionary( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos, + const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, const int nextWordPosition, const int unigramProbability) { - const int bigramsListPos = binaryDictionaryInfo->getStructurePolicy()-> - getBigramsPositionOfNode(binaryDictionaryInfo, nodePos); - BinaryDictionaryBigramsIterator bigramsIt(binaryDictionaryInfo, bigramsListPos); + const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos); + BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), + bigramsListPos); while (bigramsIt.hasNext()) { bigramsIt.next(); if (bigramsIt.getBigramPos() == nextWordPosition) { diff --git a/native/jni/src/suggest/core/dictionary/probability_utils.h b/native/jni/src/suggest/core/dictionary/probability_utils.h index f450087d8..21fe355b8 100644 --- a/native/jni/src/suggest/core/dictionary/probability_utils.h +++ b/native/jni/src/suggest/core/dictionary/probability_utils.h @@ -41,7 +41,7 @@ class ProbabilityUtils { // the unigram probability to be the median value of the 17th step from the top. A value of // 0 for the bigram probability represents the middle of the 16th step from the top, // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictInputOutput for details. + // See makedict.BinaryDictEncoder#makeBigramFlags for details. const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); return unigramProbability diff --git a/native/jni/src/suggest/core/dictionary/shortcut_utils.h b/native/jni/src/suggest/core/dictionary/shortcut_utils.h index 3c2180937..461d7b454 100644 --- a/native/jni/src/suggest/core/dictionary/shortcut_utils.h +++ b/native/jni/src/suggest/core/dictionary/shortcut_utils.h @@ -19,21 +19,20 @@ #include "defines.h" #include "suggest/core/dicnode/dic_node_utils.h" -#include "suggest/core/dictionary/terminal_attributes.h" +#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" namespace latinime { class ShortcutUtils { public: - static int outputShortcuts(const TerminalAttributes *const terminalAttributes, + static int outputShortcuts(BinaryDictionaryShortcutIterator *const shortcutIt, int outputWordIndex, const int finalScore, int *const outputCodePoints, int *const frequencies, int *const outputTypes, const bool sameAsTyped) { - TerminalAttributes::ShortcutIterator iterator = terminalAttributes->getShortcutIterator(); int shortcutTarget[MAX_WORD_LENGTH]; - while (iterator.hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) { + while (shortcutIt->hasNextShortcutTarget() && outputWordIndex < MAX_RESULTS) { bool isWhilelist; int shortcutTargetStringLength; - iterator.nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, + shortcutIt->nextShortcutTarget(MAX_WORD_LENGTH, shortcutTarget, &shortcutTargetStringLength, &isWhilelist); int shortcutScore; int kind; diff --git a/native/jni/src/suggest/core/dictionary/terminal_attributes.h b/native/jni/src/suggest/core/dictionary/terminal_attributes.h deleted file mode 100644 index 0da6504eb..000000000 --- a/native/jni/src/suggest/core/dictionary/terminal_attributes.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (C) 2012 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_TERMINAL_ATTRIBUTES_H -#define LATINIME_TERMINAL_ATTRIBUTES_H - -#include <stdint.h> - -#include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h" - -namespace latinime { - -/** - * This class encapsulates information about a terminal that allows to - * retrieve local node attributes like the list of shortcuts without - * exposing the format structure to the client. - */ -class TerminalAttributes { - public: - class ShortcutIterator { - public: - ShortcutIterator(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int shortcutPos, const bool hasShortcutList) - : mBinaryDictionaryInfo(binaryDictionaryInfo), mPos(shortcutPos), - mHasNextShortcutTarget(hasShortcutList) {} - - inline bool hasNextShortcutTarget() const { - return mHasNextShortcutTarget; - } - - // Gets the shortcut target itself as an int string and put it to outTarget, put its length - // to outTargetLength, put whether it is whitelist to outIsWhitelist. - AK_FORCE_INLINE void nextShortcutTarget( - const int maxDepth, int *const outTarget, int *const outTargetLength, - bool *const outIsWhitelist) { - const BinaryDictionaryTerminalAttributesReadingUtils::ShortcutFlags flags = - BinaryDictionaryTerminalAttributesReadingUtils::getFlagsAndForwardPointer( - mBinaryDictionaryInfo, &mPos); - mHasNextShortcutTarget = - BinaryDictionaryTerminalAttributesReadingUtils::hasNext(flags); - if (outIsWhitelist) { - *outIsWhitelist = - BinaryDictionaryTerminalAttributesReadingUtils::isWhitelist(flags); - } - if (outTargetLength) { - *outTargetLength = - BinaryDictionaryTerminalAttributesReadingUtils::readShortcutTarget( - mBinaryDictionaryInfo, maxDepth, outTarget, &mPos); - } - } - - private: - const BinaryDictionaryInfo *const mBinaryDictionaryInfo; - int mPos; - bool mHasNextShortcutTarget; - }; - - TerminalAttributes(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int shortcutPos) - : mBinaryDictionaryInfo(binaryDictionaryInfo), mShortcutListSizePos(shortcutPos) {} - - inline ShortcutIterator getShortcutIterator() const { - int shortcutPos = mShortcutListSizePos; - const bool hasShortcutList = shortcutPos != NOT_A_DICT_POS; - if (hasShortcutList) { - BinaryDictionaryTerminalAttributesReadingUtils::getShortcutListSizeAndForwardPointer( - mBinaryDictionaryInfo, &shortcutPos); - } - // shortcutPos is never used if hasShortcutList is false. - return ShortcutIterator(mBinaryDictionaryInfo, shortcutPos, hasShortcutList); - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(TerminalAttributes); - const BinaryDictionaryInfo *const mBinaryDictionaryInfo; - const int mShortcutListSizePos; -}; -} // namespace latinime -#endif // LATINIME_TERMINAL_ATTRIBUTES_H diff --git a/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h new file mode 100644 index 000000000..661ef1b1a --- /dev/null +++ b/native/jni/src/suggest/core/policy/dictionary_bigrams_structure_policy.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of bigrams. + */ +class DictionaryBigramsStructurePolicy { + public: + virtual ~DictionaryBigramsStructurePolicy() {} + + virtual void getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const pos) const = 0; + virtual void skipAllBigrams(int *const pos) const = 0; + + protected: + DictionaryBigramsStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryBigramsStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_BIGRAMS_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h new file mode 100644 index 000000000..a6829b476 --- /dev/null +++ b/native/jni/src/suggest/core/policy/dictionary_header_structure_policy.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of dictionaries. + * Implement this policy to support additional dictionaries. + */ +class DictionaryHeaderStructurePolicy { + public: + virtual ~DictionaryHeaderStructurePolicy() {} + + virtual bool supportsDynamicUpdate() const = 0; + + virtual bool requiresGermanUmlautProcessing() const = 0; + + virtual bool requiresFrenchLigatureProcessing() const = 0; + + virtual float getMultiWordCostMultiplier() const = 0; + + virtual void readHeaderValueOrQuestionMark(const char *const key, int *outValue, + int outValueSize) const = 0; + + protected: + DictionaryHeaderStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryHeaderStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_HEADER_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h new file mode 100644 index 000000000..40b6c2de1 --- /dev/null +++ b/native/jni/src/suggest/core/policy/dictionary_shortcuts_structure_policy.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +/* + * This class abstracts structure of shortcuts. + */ +class DictionaryShortcutsStructurePolicy { + public: + virtual ~DictionaryShortcutsStructurePolicy() {} + + virtual int getStartPos(const int pos) const = 0; + + virtual void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const = 0; + + virtual void skipAllShortcuts(int *const pos) const = 0; + + protected: + DictionaryShortcutsStructurePolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryShortcutsStructurePolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_SHORTCUTS_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_policy.h deleted file mode 100644 index cc14c982c..000000000 --- a/native/jni/src/suggest/core/policy/dictionary_structure_policy.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H -#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H - -#include "defines.h" - -namespace latinime { - -class BinaryDictionaryInfo; -class DicNode; -class DicNodeVector; - -/* - * This class abstracts structure of dictionaries. - * Implement this policy to support additional dictionaries. - */ -class DictionaryStructurePolicy { - public: - // This provides a filtering method for filtering new node. - class NodeFilter { - public: - virtual bool isFilteredOut(const int codePoint) const = 0; - - protected: - NodeFilter() {} - virtual ~NodeFilter() {} - - private: - DISALLOW_COPY_AND_ASSIGN(NodeFilter); - }; - - virtual int getRootPosition() const = 0; - - virtual void createAndGetAllChildNodes(const DicNode *const dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const = 0; - - virtual int getCodePointsAndProbabilityAndReturnCodePointCount( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos, const int maxCodePointCount, int *const outCodePoints, - int *const outUnigramProbability) const = 0; - - virtual int getTerminalNodePositionOfWord( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, - const int length, const bool forceLowerCaseSearch) const = 0; - - virtual int getUnigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const = 0; - - virtual int getShortcutPositionOfNode(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const = 0; - - virtual int getBigramsPositionOfNode(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const = 0; - - protected: - DictionaryStructurePolicy() {} - virtual ~DictionaryStructurePolicy() {} - - private: - DISALLOW_COPY_AND_ASSIGN(DictionaryStructurePolicy); -}; -} // namespace latinime -#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h new file mode 100644 index 000000000..532411509 --- /dev/null +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_H +#define LATINIME_DICTIONARY_STRUCTURE_POLICY_H + +#include "defines.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; +class DictionaryBigramsStructurePolicy; +class DictionaryHeaderStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This class abstracts structure of dictionaries. + * Implement this policy to support additional dictionaries. + */ +class DictionaryStructureWithBufferPolicy { + public: + virtual ~DictionaryStructureWithBufferPolicy() {} + + virtual int getRootPosition() const = 0; + + virtual void createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const = 0; + + virtual int getCodePointsAndProbabilityAndReturnCodePointCount( + const int nodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const = 0; + + virtual int getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const = 0; + + virtual int getUnigramProbability(const int nodePos) const = 0; + + virtual int getShortcutPositionOfNode(const int nodePos) const = 0; + + virtual int getBigramsPositionOfNode(const int nodePos) const = 0; + + virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0; + + virtual const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const = 0; + + virtual const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const = 0; + + // Returns whether the update was success or not. + virtual bool addUnigramWord(const int *const word, const int length, + const int probability) = 0; + + // Returns whether the update was success or not. + virtual bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability) = 0; + + // Returns whether the update was success or not. + virtual bool removeBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1) = 0; + + protected: + DictionaryStructureWithBufferPolicy() {} + + private: + DISALLOW_COPY_AND_ASSIGN(DictionaryStructureWithBufferPolicy); +}; +} // namespace latinime +#endif /* LATINIME_DICTIONARY_STRUCTURE_POLICY_H */ diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp index 58729229f..f9b777df2 100644 --- a/native/jni/src/suggest/core/policy/weighting.cpp +++ b/native/jni/src/suggest/core/policy/weighting.cpp @@ -148,7 +148,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n case CT_TERMINAL: { const float languageImprobability = DicNodeUtils::getBigramNodeImprobability( - traverseSession->getBinaryDictionaryInfo(), dicNode, multiBigramMap); + traverseSession->getDictionaryStructurePolicy(), dicNode, multiBigramMap); return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability); } case CT_TERMINAL_INSERTION: diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp index 7651b19a0..0ca583f90 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp +++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp @@ -17,32 +17,30 @@ #include "suggest/core/session/dic_traverse_session.h" #include "defines.h" -#include "jni.h" -#include "suggest/core/dictionary/binary_dictionary_header.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/core/dictionary/dictionary.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" namespace latinime { void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord, int prevWordLength, const SuggestOptions *const suggestOptions) { mDictionary = dictionary; - const BinaryDictionaryInfo *const binaryDictionaryInfo = - mDictionary->getBinaryDictionaryInfo(); - mMultiWordCostMultiplier = binaryDictionaryInfo->getHeader()->getMultiWordCostMultiplier(); + mMultiWordCostMultiplier = getDictionaryStructurePolicy()->getHeaderStructurePolicy() + ->getMultiWordCostMultiplier(); mSuggestOptions = suggestOptions; if (!prevWord) { mPrevWordPos = NOT_A_VALID_WORD_POS; return; } // TODO: merge following similar calls to getTerminalPosition into one case-insensitive call. - mPrevWordPos = binaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( - binaryDictionaryInfo, prevWord, prevWordLength, false /* forceLowerCaseSearch */); + mPrevWordPos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord( + prevWord, prevWordLength, false /* forceLowerCaseSearch */); if (mPrevWordPos == NOT_A_VALID_WORD_POS) { // Check bigrams for lower-cased previous word if original was not found. Useful for // auto-capitalized words like "The [current_word]". - mPrevWordPos = binaryDictionaryInfo->getStructurePolicy()->getTerminalNodePositionOfWord( - binaryDictionaryInfo, prevWord, prevWordLength, true /* forceLowerCaseSearch */); + mPrevWordPos = getDictionaryStructurePolicy()->getTerminalNodePositionOfWord( + prevWord, prevWordLength, true /* forceLowerCaseSearch */); } } @@ -56,8 +54,9 @@ void DicTraverseSession::setupForGetSuggestions(const ProximityInfo *pInfo, maxSpatialDistance, maxPointerCount); } -const BinaryDictionaryInfo *DicTraverseSession::getBinaryDictionaryInfo() const { - return mDictionary->getBinaryDictionaryInfo(); +const DictionaryStructureWithBufferPolicy *DicTraverseSession::getDictionaryStructurePolicy() + const { + return mDictionary->getDictionaryStructurePolicy(); } void DicTraverseSession::resetCache(const int nextActiveCacheSize, const int maxWords) { diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h index de57e041a..23de5cc65 100644 --- a/native/jni/src/suggest/core/session/dic_traverse_session.h +++ b/native/jni/src/suggest/core/session/dic_traverse_session.h @@ -28,8 +28,8 @@ namespace latinime { -class BinaryDictionaryInfo; class Dictionary; +class DictionaryStructureWithBufferPolicy; class ProximityInfo; class SuggestOptions; @@ -75,8 +75,7 @@ class DicTraverseSession { const int maxPointerCount); void resetCache(const int nextActiveCacheSize, const int maxWords); - // TODO: Remove - const BinaryDictionaryInfo *getBinaryDictionaryInfo() const; + const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const; //-------------------- // getters and setters diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 9376d7b93..7d8dd21c5 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -19,12 +19,12 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_priority_queue.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" +#include "suggest/core/dictionary/binary_dictionary_shortcut_iterator.h" #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/dictionary/digraph_utils.h" #include "suggest/core/dictionary/shortcut_utils.h" -#include "suggest/core/dictionary/terminal_attributes.h" #include "suggest/core/layout/proximity_info.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/policy/scoring.h" #include "suggest/core/policy/traversal.h" #include "suggest/core/policy/weighting.h" @@ -107,7 +107,7 @@ void Suggest::initializeSearch(DicTraverseSession *traverseSession, int commitPo MAX_RESULTS); // Create a new dic node here DicNode rootNode; - DicNodeUtils::initAsRoot(traverseSession->getBinaryDictionaryInfo(), + DicNodeUtils::initAsRoot(traverseSession->getDictionaryStructurePolicy(), traverseSession->getPrevWordPos(), &rootNode); traverseSession->getDicTraverseCache()->copyPushActive(&rootNode); } @@ -211,15 +211,14 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen } if (!terminalDicNode->hasMultipleWords()) { - const BinaryDictionaryInfo *const binaryDictionaryInfo = - traverseSession->getBinaryDictionaryInfo(); - const TerminalAttributes terminalAttributes(traverseSession->getBinaryDictionaryInfo(), - binaryDictionaryInfo->getStructurePolicy()->getShortcutPositionOfNode( - binaryDictionaryInfo, terminalDicNode->getPos())); + BinaryDictionaryShortcutIterator shortcutIt( + traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), + traverseSession->getDictionaryStructurePolicy() + ->getShortcutPositionOfNode(terminalDicNode->getPos())); // Shortcut is not supported for multiple words suggestions. // TODO: Check shortcuts during traversal for multiple words suggestions. const bool sameAsTyped = TRAVERSAL->sameAsTyped(traverseSession, terminalDicNode); - outputWordIndex = ShortcutUtils::outputShortcuts(&terminalAttributes, outputWordIndex, + outputWordIndex = ShortcutUtils::outputShortcuts(&shortcutIt, outputWordIndex, finalScore, outputCodePoints, frequencies, outputTypes, sameAsTyped); } DicNode::managedDelete(terminalDicNode); @@ -298,7 +297,7 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { } DicNodeUtils::getAllChildDicNodes( - &dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes); + &dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes); const int childDicNodesSize = childDicNodes.getSizeAndLock(); for (int i = 0; i < childDicNodesSize; ++i) { @@ -309,7 +308,8 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const { continue; } if (DigraphUtils::hasDigraphForCodePoint( - traverseSession->getBinaryDictionaryInfo()->getHeader(), + traverseSession->getDictionaryStructurePolicy() + ->getHeaderStructurePolicy(), childDicNode->getNodeCodePoint())) { correctionDicNode.initByCopy(childDicNode); correctionDicNode.advanceDigraphIndex(); @@ -447,7 +447,7 @@ void Suggest::processDicNodeAsOmission( DicTraverseSession *traverseSession, DicNode *dicNode) const { DicNodeVector childDicNodes; DicNodeUtils::getAllChildDicNodes( - dicNode, traverseSession->getBinaryDictionaryInfo(), &childDicNodes); + dicNode, traverseSession->getDictionaryStructurePolicy(), &childDicNodes); const int size = childDicNodes.getSizeAndLock(); for (int i = 0; i < size; i++) { @@ -456,7 +456,6 @@ void Suggest::processDicNodeAsOmission( Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_OMISSION, traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */); weightChildNode(traverseSession, childDicNode); - if (!TRAVERSAL->isPossibleOmissionChildNode(traverseSession, dicNode, childDicNode)) { continue; } @@ -472,10 +471,14 @@ void Suggest::processDicNodeAsInsertion(DicTraverseSession *traverseSession, DicNode *dicNode) const { const int16_t pointIndex = dicNode->getInputIndex(0); DicNodeVector childDicNodes; - DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(), - traverseSession->getProximityInfoState(0), pointIndex + 1, true, &childDicNodes); + DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getDictionaryStructurePolicy(), + &childDicNodes); const int size = childDicNodes.getSizeAndLock(); for (int i = 0; i < size; i++) { + if (traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex + 1) + != childDicNodes[i]->getNodeCodePoint()) { + continue; + } DicNode *const childDicNode = childDicNodes[i]; Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_INSERTION, traverseSession, dicNode, childDicNode, 0 /* multiBigramMap */); @@ -490,18 +493,29 @@ void Suggest::processDicNodeAsTransposition(DicTraverseSession *traverseSession, DicNode *dicNode) const { const int16_t pointIndex = dicNode->getInputIndex(0); DicNodeVector childDicNodes1; - DicNodeUtils::getProximityChildDicNodes(dicNode, traverseSession->getBinaryDictionaryInfo(), - traverseSession->getProximityInfoState(0), pointIndex + 1, false, &childDicNodes1); + DicNodeUtils::getAllChildDicNodes(dicNode, traverseSession->getDictionaryStructurePolicy(), + &childDicNodes1); const int childSize1 = childDicNodes1.getSizeAndLock(); for (int i = 0; i < childSize1; i++) { + const ProximityType matchedId1 = traverseSession->getProximityInfoState(0) + ->getProximityType(pointIndex + 1, childDicNodes1[i]->getNodeCodePoint(), + true /* checkProximityChars */); + if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId1)) { + continue; + } if (childDicNodes1[i]->hasChildren()) { DicNodeVector childDicNodes2; - DicNodeUtils::getProximityChildDicNodes( - childDicNodes1[i], traverseSession->getBinaryDictionaryInfo(), - traverseSession->getProximityInfoState(0), pointIndex, false, &childDicNodes2); + DicNodeUtils::getAllChildDicNodes(childDicNodes1[i], + traverseSession->getDictionaryStructurePolicy(), &childDicNodes2); const int childSize2 = childDicNodes2.getSizeAndLock(); for (int j = 0; j < childSize2; j++) { DicNode *const childDicNode2 = childDicNodes2[j]; + const ProximityType matchedId2 = traverseSession->getProximityInfoState(0) + ->getProximityType(pointIndex, childDicNode2->getNodeCodePoint(), + true /* checkProximityChars */); + if (!ProximityInfoUtils::isMatchOrProximityChar(matchedId2)) { + continue; + } Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_TRANSPOSITION, traverseSession, childDicNodes1[i], childDicNode2, 0 /* multiBigramMap */); processExpandedDicNode(traverseSession, childDicNode2); @@ -538,7 +552,7 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode // Create a non-cached node here. DicNode newDicNode; DicNodeUtils::initAsRootWithPreviousWord( - traverseSession->getBinaryDictionaryInfo(), dicNode, &newDicNode); + traverseSession->getDictionaryStructurePolicy(), dicNode, &newDicNode); const CorrectionType correctionType = spaceSubstitution ? CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION; Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode, diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h new file mode 100644 index 000000000..26015d512 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_POLICY_H +#define LATINIME_BIGRAM_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" + +namespace latinime { + +class BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + explicit BigramListPolicy(const uint8_t *const bigramsBuf) : mBigramsBuf(bigramsBuf) {} + + ~BigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const pos) const { + const BigramListReadWriteUtils::BigramFlags flags = + BigramListReadWriteUtils::getFlagsAndForwardPointer(mBigramsBuf, pos); + *outBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + mBigramsBuf, flags, pos); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); + } + + void skipAllBigrams(int *const pos) const { + BigramListReadWriteUtils::skipExistingBigrams(mBigramsBuf, pos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); + + const uint8_t *const mBigramsBuf; +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp new file mode 100644 index 000000000..d57547445 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = + 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; +// Flag for presence of more attributes +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT = + 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; + +/* static */ int BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) { + int offset = 0; + const int origin = *pos; + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(bigramsBuf, pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); + break; + } + if (offset == 0) { + return NOT_A_VALID_WORD_POS; + } + if (isOffsetNegative(flags)) { + return origin - offset; + } else { + return origin + offset; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h new file mode 100644 index 000000000..ee2b722a4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H +#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H + +#include <cstdlib> +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +class BigramListReadWriteUtils { +public: + typedef uint8_t BigramFlags; + + static AK_FORCE_INLINE BigramFlags getFlagsAndForwardPointer( + const uint8_t *const bigramsBuf, int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos); + } + + static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // Bigrams reading methods + static AK_FORCE_INLINE void skipExistingBigrams(const uint8_t *const bigramsBuf, + int *const pos) { + BigramFlags flags = getFlagsAndForwardPointer(bigramsBuf, pos); + while (hasNext(flags)) { + *pos += attributeAddressSize(flags); + flags = getFlagsAndForwardPointer(bigramsBuf, pos); + } + *pos += attributeAddressSize(flags); + } + + static int getBigramAddressAndForwardPointer(const uint8_t *const bigramsBuf, + const BigramFlags flags, int *const pos); + + // Returns the size of the bigram position field that is stored in bigram flags. + static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { + return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + /* Note: this is a value-dependant optimization of what may probably be + more readably written this way: + switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + default: return 0; + } + */ + } + + static AK_FORCE_INLINE BigramFlags setHasNextFlag(const BigramFlags flags) { + return flags | FLAG_ATTRIBUTE_HAS_NEXT; + } + + // Returns true if the bigram entry is valid and put entry values into out*. + static AK_FORCE_INLINE bool createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( + const int entryPos, const int targetPos, const int probability, const bool hasNext, + BigramFlags *const outBigramFlags, uint32_t *const outOffset, + int *const outOffsetFieldSize) { + if (targetPos == NOT_A_VALID_WORD_POS) { + return false; + } + BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; + if (hasNext) { + flags |= FLAG_ATTRIBUTE_HAS_NEXT; + } + const int targetFieldPos = entryPos + 1; + const int offset = targetPos - targetFieldPos; + if (offset < 0) { + flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + } + const uint32_t absOffest = abs(offset); + if ((absOffest >> 24) != 0) { + // Offset is too large. + return false; + } else if ((absOffest >> 16) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + *outOffsetFieldSize = 3; + } else if ((absOffest >> 8) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + *outOffsetFieldSize = 2; + } else { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + *outOffsetFieldSize = 1; + } + *outBigramFlags = flags; + *outOffset = absOffest; + return true; + } + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); + + static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; + static const int ATTRIBUTE_ADDRESS_SHIFT; + + static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; + } +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp new file mode 100644 index 000000000..e31a91069 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" + +namespace latinime { + +bool DynamicBigramListPolicy::copyAllBigrams(int *const fromPos, int *const toPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags flags; + do { + flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, fromPos); + int bigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + buffer, flags, fromPos); + if (bigramPos == NOT_A_VALID_WORD_POS) { + // skip invalid bigram entry. + continue; + } + if (usesAdditionalBuffer) { + bigramPos += mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags newBigramFlags; + uint32_t newBigramOffset; + int newBigramOffsetFieldSize; + if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( + *toPos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags), + BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset, + &newBigramOffsetFieldSize)) { + continue; + } + // Write bigram entry. Target buffer is always the additional buffer. + if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */,toPos)) { + return false; + } + if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize, + toPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(flags)); + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); + } + return true; +} + +bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int probability, + int *const pos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags flags; + do { + int entryPos = *pos; + if (usesAdditionalBuffer) { + entryPos += mBuffer->getOriginalBufferSize(); + } + flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos); + BigramListReadWriteUtils::getBigramAddressAndForwardPointer(buffer, flags, pos); + if (BigramListReadWriteUtils::hasNext(flags)) { + continue; + } + // The current last entry is found. + // First, update the flags of the last entry. + const BigramListReadWriteUtils::BigramFlags updatedFlags = + BigramListReadWriteUtils::setHasNextFlag(flags); + if (!mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos)) { + return false; + } + // Then, add a new entry after the last entry. + BigramListReadWriteUtils::BigramFlags newBigramFlags; + uint32_t newBigramOffset; + int newBigramOffsetFieldSize; + if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( + *pos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags), + BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset, + &newBigramOffsetFieldSize)) { + continue; + } + int newEntryPos = *pos; + if (usesAdditionalBuffer) { + newEntryPos += mBuffer->getOriginalBufferSize(); + } + // Write bigram flags. + if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */, + &newEntryPos)) { + return false; + } + // Write bigram positon offset. + if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize, + &newEntryPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(flags)); + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + return true; +} + +bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int targetBigramPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = bigramListPos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags flags; + do { + flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, &pos); + int bigramOffsetFieldPos = pos; + if (usesAdditionalBuffer) { + bigramOffsetFieldPos += mBuffer->getOriginalBufferSize(); + } + int bigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + buffer, flags, &pos); + if (usesAdditionalBuffer && bigramPos != NOT_A_VALID_WORD_POS) { + bigramPos += mBuffer->getOriginalBufferSize(); + } + if (bigramPos != targetBigramPos) { + continue; + } + // Target entry is found. Write 0 into the bigram pos field to mark the bigram invalid. + const int bigramOffsetFieldSize = + BigramListReadWriteUtils::attributeAddressSize(flags); + if (!mBuffer->writeUintAndAdvancePosition(0 /* data */, bigramOffsetFieldSize, + &bigramOffsetFieldPos)) { + return false; + } + return true; + } while(BigramListReadWriteUtils::hasNext(flags)); + return false; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h new file mode 100644 index 000000000..b7c05376d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H +#define LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +/* + * This is a dynamic version of BigramListPolicy and supports an additional buffer. + */ +class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} + + ~DynamicBigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const pos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + const BigramListReadWriteUtils::BigramFlags flags = + BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos); + *outBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + buffer, flags, pos); + if (usesAdditionalBuffer && *outBigramPos != NOT_A_VALID_WORD_POS) { + *outBigramPos += mBuffer->getOriginalBufferSize(); + } + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + void skipAllBigrams(int *const pos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::skipExistingBigrams(buffer, pos); + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + // Copy bigrams from the bigram list that starts at fromPos to toPos and advance these + // positions after bigram lists. This method skips invalid bigram entries. + bool copyAllBigrams(int *const fromPos, int *const toPos); + + bool addBigramEntry(const int bigramPos, const int probability, int *const pos); + + // Return if targetBigramPos is found or not. + bool removeBigram(const int bigramListPos, const int targetBigramPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicBigramListPolicy); + + BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace latinime +#endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/binary_format.h b/native/jni/src/suggest/policyimpl/dictionary/binary_format.h deleted file mode 100644 index 23f4c7fec..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/binary_format.h +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Copyright (C) 2011 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BINARY_FORMAT_H -#define LATINIME_BINARY_FORMAT_H - -#include <stdint.h> - -#include "suggest/core/dictionary/probability_utils.h" -#include "utils/char_utils.h" - -namespace latinime { - -class BinaryFormat { - public: - // Mask and flags for children address type selection. - static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; - - // Flag for single/multiple char group - static const int FLAG_HAS_MULTIPLE_CHARS = 0x20; - - // Flag for terminal groups - static const int FLAG_IS_TERMINAL = 0x10; - - // Flag for shortcut targets presence - static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08; - // Flag for bigram presence - static const int FLAG_HAS_BIGRAMS = 0x04; - // Flag for non-words (typically, shortcut only entries) - static const int FLAG_IS_NOT_A_WORD = 0x02; - // Flag for blacklist - static const int FLAG_IS_BLACKLISTED = 0x01; - - // Attribute (bigram/shortcut) related flags: - // Flag for presence of more attributes - static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; - // Flag for sign of offset. If this flag is set, the offset value must be negated. - static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; - - // Mask for attribute probability, stored on 4 bits inside the flags byte. - static const int MASK_ATTRIBUTE_PROBABILITY = 0x0F; - - // Mask and flags for attribute address type selection. - static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; - - static int getGroupCountAndForwardPointer(const uint8_t *const dict, int *pos); - static uint8_t getFlagsAndForwardPointer(const uint8_t *const dict, int *pos); - static int getCodePointAndForwardPointer(const uint8_t *const dict, int *pos); - static int readProbabilityWithoutMovingPointer(const uint8_t *const dict, const int pos); - static int skipOtherCharacters(const uint8_t *const dict, const int pos); - static int skipChildrenPosition(const uint8_t flags, const int pos); - static int skipProbability(const uint8_t flags, const int pos); - static int skipShortcuts(const uint8_t *const dict, const uint8_t flags, const int pos); - static int skipChildrenPosAndAttributes(const uint8_t *const dict, const uint8_t flags, - const int pos); - static int readChildrenPosition(const uint8_t *const dict, const uint8_t flags, const int pos); - static bool hasChildrenInFlags(const uint8_t flags); - static int getTerminalPosition(const uint8_t *const root, const int *const inWord, - const int length, const bool forceLowerCaseSearch); - static int getCodePointsAndProbabilityAndReturnCodePointCount( - const uint8_t *const root, const int nodePos, const int maxCodePointCount, - int *const outCodePoints, int *const outUnigramProbability); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryFormat); - - static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; - static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; - static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; - static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; - static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; - - static const int CHARACTER_ARRAY_TERMINATOR_SIZE = 1; - static const int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; - static const int CHARACTER_ARRAY_TERMINATOR = 0x1F; - static const int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; - static const int NO_FLAGS = 0; - static int skipAllAttributes(const uint8_t *const dict, const uint8_t flags, const int pos); - static int skipBigrams(const uint8_t *const dict, const uint8_t flags, const int pos); -}; - -AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *const dict, - int *pos) { - const int msb = dict[(*pos)++]; - if (msb < 0x80) return msb; - return ((msb & 0x7F) << 8) | dict[(*pos)++]; -} - -inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) { - return dict[(*pos)++]; -} - -AK_FORCE_INLINE int BinaryFormat::getCodePointAndForwardPointer(const uint8_t *const dict, - int *pos) { - const int origin = *pos; - const int codePoint = dict[origin]; - if (codePoint < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { - if (codePoint == CHARACTER_ARRAY_TERMINATOR) { - *pos = origin + 1; - return NOT_A_CODE_POINT; - } else { - *pos = origin + 3; - const int char_1 = codePoint << 16; - const int char_2 = char_1 + (dict[origin + 1] << 8); - return char_2 + dict[origin + 2]; - } - } else { - *pos = origin + 1; - return codePoint; - } -} - -inline int BinaryFormat::readProbabilityWithoutMovingPointer(const uint8_t *const dict, - const int pos) { - return dict[pos]; -} - -AK_FORCE_INLINE int BinaryFormat::skipOtherCharacters(const uint8_t *const dict, const int pos) { - int currentPos = pos; - int character = dict[currentPos++]; - while (CHARACTER_ARRAY_TERMINATOR != character) { - if (character < MINIMAL_ONE_BYTE_CHARACTER_VALUE) { - currentPos += MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE; - } - character = dict[currentPos++]; - } - return currentPos; -} - -static inline int attributeAddressSize(const uint8_t flags) { - static const int ATTRIBUTE_ADDRESS_SHIFT = 4; - return (flags & BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; - /* Note: this is a value-dependant optimization of what may probably be - more readably written this way: - switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; - default: return 0; - } - */ -} - -static AK_FORCE_INLINE int skipExistingBigrams(const uint8_t *const dict, const int pos) { - int currentPos = pos; - uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); - while (flags & BinaryFormat::FLAG_ATTRIBUTE_HAS_NEXT) { - currentPos += attributeAddressSize(flags); - flags = BinaryFormat::getFlagsAndForwardPointer(dict, ¤tPos); - } - currentPos += attributeAddressSize(flags); - return currentPos; -} - -static inline int childrenAddressSize(const uint8_t flags) { - static const int CHILDREN_ADDRESS_SHIFT = 6; - return (BinaryFormat::MASK_GROUP_ADDRESS_TYPE & flags) >> CHILDREN_ADDRESS_SHIFT; - /* See the note in attributeAddressSize. The same applies here */ -} - -static AK_FORCE_INLINE int shortcutByteSize(const uint8_t *const dict, const int pos) { - return (static_cast<int>(dict[pos] << 8)) + (dict[pos + 1]); -} - -inline int BinaryFormat::skipChildrenPosition(const uint8_t flags, const int pos) { - return pos + childrenAddressSize(flags); -} - -inline int BinaryFormat::skipProbability(const uint8_t flags, const int pos) { - return FLAG_IS_TERMINAL & flags ? pos + 1 : pos; -} - -AK_FORCE_INLINE int BinaryFormat::skipShortcuts(const uint8_t *const dict, const uint8_t flags, - const int pos) { - if (FLAG_HAS_SHORTCUT_TARGETS & flags) { - return pos + shortcutByteSize(dict, pos); - } else { - return pos; - } -} - -AK_FORCE_INLINE int BinaryFormat::skipBigrams(const uint8_t *const dict, const uint8_t flags, - const int pos) { - if (FLAG_HAS_BIGRAMS & flags) { - return skipExistingBigrams(dict, pos); - } else { - return pos; - } -} - -AK_FORCE_INLINE int BinaryFormat::skipAllAttributes(const uint8_t *const dict, const uint8_t flags, - const int pos) { - // This function skips all attributes: shortcuts and bigrams. - int newPos = pos; - newPos = skipShortcuts(dict, flags, newPos); - newPos = skipBigrams(dict, flags, newPos); - return newPos; -} - -AK_FORCE_INLINE int BinaryFormat::skipChildrenPosAndAttributes(const uint8_t *const dict, - const uint8_t flags, const int pos) { - int currentPos = pos; - currentPos = skipChildrenPosition(flags, currentPos); - currentPos = skipAllAttributes(dict, flags, currentPos); - return currentPos; -} - -AK_FORCE_INLINE int BinaryFormat::readChildrenPosition(const uint8_t *const dict, - const uint8_t flags, const int pos) { - int offset = 0; - switch (MASK_GROUP_ADDRESS_TYPE & flags) { - case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: - offset = dict[pos]; - break; - case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: - offset = dict[pos] << 8; - offset += dict[pos + 1]; - break; - case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: - offset = dict[pos] << 16; - offset += dict[pos + 1] << 8; - offset += dict[pos + 2]; - break; - default: - // If we come here, it means we asked for the children of a word with - // no children. - return -1; - } - return pos + offset; -} - -inline bool BinaryFormat::hasChildrenInFlags(const uint8_t flags) { - return (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags)); -} - -// This function gets the byte position of the last chargroup of the exact matching word in the -// dictionary. If no match is found, it returns NOT_A_VALID_WORD_POS. -AK_FORCE_INLINE int BinaryFormat::getTerminalPosition(const uint8_t *const root, - const int *const inWord, const int length, const bool forceLowerCaseSearch) { - int pos = 0; - int wordPos = 0; - - while (true) { - // If we already traversed the tree further than the word is long, there means - // there was no match (or we would have found it). - if (wordPos >= length) return NOT_A_VALID_WORD_POS; - int charGroupCount = BinaryFormat::getGroupCountAndForwardPointer(root, &pos); - const int wChar = forceLowerCaseSearch - ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; - while (true) { - // If there are no more character groups in this node, it means we could not - // find a matching character for this depth, therefore there is no match. - if (0 >= charGroupCount) return NOT_A_VALID_WORD_POS; - const int charGroupPos = pos; - const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); - int character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); - if (character == wChar) { - // This is the correct node. Only one character group may start with the same - // char within a node, so either we found our match in this node, or there is - // no match and we can return NOT_A_VALID_WORD_POS. So we will check all the - // characters in this character group indeed does match. - if (FLAG_HAS_MULTIPLE_CHARS & flags) { - character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); - while (NOT_A_CODE_POINT != character) { - ++wordPos; - // If we shoot the length of the word we search for, or if we find a single - // character that does not match, as explained above, it means the word is - // not in the dictionary (by virtue of this chargroup being the only one to - // match the word on the first character, but not matching the whole word). - if (wordPos >= length) return NOT_A_VALID_WORD_POS; - if (inWord[wordPos] != character) return NOT_A_VALID_WORD_POS; - character = BinaryFormat::getCodePointAndForwardPointer(root, &pos); - } - } - // If we come here we know that so far, we do match. Either we are on a terminal - // and we match the length, in which case we found it, or we traverse children. - // If we don't match the length AND don't have children, then a word in the - // dictionary fully matches a prefix of the searched word but not the full word. - ++wordPos; - if (FLAG_IS_TERMINAL & flags) { - if (wordPos == length) { - return charGroupPos; - } - pos = BinaryFormat::skipProbability(FLAG_IS_TERMINAL, pos); - } - if (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS == (MASK_GROUP_ADDRESS_TYPE & flags)) { - return NOT_A_VALID_WORD_POS; - } - // We have children and we are still shorter than the word we are searching for, so - // we need to traverse children. Put the pointer on the children position, and - // break - pos = BinaryFormat::readChildrenPosition(root, flags, pos); - break; - } else { - // This chargroup does not match, so skip the remaining part and go to the next. - if (FLAG_HAS_MULTIPLE_CHARS & flags) { - pos = BinaryFormat::skipOtherCharacters(root, pos); - } - pos = BinaryFormat::skipProbability(flags, pos); - pos = BinaryFormat::skipChildrenPosAndAttributes(root, flags, pos); - } - --charGroupCount; - } - } -} - -// This function searches for a terminal in the dictionary by its address. -// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, -// it is possible to check for this with advantageous complexity. For each node, we search -// for groups with children and compare the children address with the address we look for. -// When we shoot the address we look for, it means the word we look for is in the children -// of the previous group. The only tricky part is the fact that if we arrive at the end of a -// node with the last group's children address still less than what we are searching for, we -// must descend the last group's children (for example, if the word we are searching for starts -// with a z, it's the last group of the root node, so all children addresses will be smaller -// than the address we look for, and we have to descend the z node). -/* Parameters : - * root: the dictionary buffer - * address: the byte position of the last chargroup of the word we are searching for (this is - * what is stored as the "bigram address" in each bigram) - * outword: an array to write the found word, with MAX_WORD_LENGTH size. - * outUnigramProbability: a pointer to an int to write the probability into. - * Return value : the length of the word, of 0 if the word was not found. - */ -AK_FORCE_INLINE int BinaryFormat::getCodePointsAndProbabilityAndReturnCodePointCount( - const uint8_t *const root, const int nodePos, const int maxCodePointCount, - int *const outCodePoints, int *const outUnigramProbability) { - int pos = 0; - int wordPos = 0; - - // One iteration of the outer loop iterates through nodes. As stated above, we will only - // traverse nodes that are actually a part of the terminal we are searching, so each time - // we enter this loop we are one depth level further than last time. - // The only reason we count nodes is because we want to reduce the probability of infinite - // looping in case there is a bug. Since we know there is an upper bound to the depth we are - // supposed to traverse, it does not hurt to count iterations. - for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { - int lastCandidateGroupPos = 0; - // Let's loop through char groups in this node searching for either the terminal - // or one of its ascendants. - for (int charGroupCount = getGroupCountAndForwardPointer(root, &pos); charGroupCount > 0; - --charGroupCount) { - const int startPos = pos; - const uint8_t flags = getFlagsAndForwardPointer(root, &pos); - const int character = getCodePointAndForwardPointer(root, &pos); - if (nodePos == startPos) { - // We found the address. Copy the rest of the word in the buffer and return - // the length. - outCodePoints[wordPos] = character; - if (FLAG_HAS_MULTIPLE_CHARS & flags) { - int nextChar = getCodePointAndForwardPointer(root, &pos); - // We count chars in order to avoid infinite loops if the file is broken or - // if there is some other bug - int charCount = maxCodePointCount; - while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { - outCodePoints[++wordPos] = nextChar; - nextChar = getCodePointAndForwardPointer(root, &pos); - } - } - *outUnigramProbability = readProbabilityWithoutMovingPointer(root, pos); - return ++wordPos; - } - // We need to skip past this char group, so skip any remaining chars after the - // first and possibly the probability. - if (FLAG_HAS_MULTIPLE_CHARS & flags) { - pos = skipOtherCharacters(root, pos); - } - pos = skipProbability(flags, pos); - - // The fact that this group has children is very important. Since we already know - // that this group does not match, if it has no children we know it is irrelevant - // to what we are searching for. - const bool hasChildren = (FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != - (MASK_GROUP_ADDRESS_TYPE & flags)); - // We will write in `found' whether we have passed the children address we are - // searching for. For example if we search for "beer", the children of b are less - // than the address we are searching for and the children of c are greater. When we - // come here for c, we realize this is too big, and that we should descend b. - bool found; - if (hasChildren) { - // Here comes the tricky part. First, read the children position. - const int childrenPos = readChildrenPosition(root, flags, pos); - if (childrenPos > nodePos) { - // If the children pos is greater than address, it means the previous chargroup, - // which address is stored in lastCandidateGroupPos, was the right one. - found = true; - } else if (1 >= charGroupCount) { - // However if we are on the LAST group of this node, and we have NOT shot the - // address we should descend THIS node. So we trick the lastCandidateGroupPos - // so that we will descend this node, not the previous one. - lastCandidateGroupPos = startPos; - found = true; - } else { - // Else, we should continue looking. - found = false; - } - } else { - // Even if we don't have children here, we could still be on the last group of this - // node. If this is the case, we should descend the last group that had children, - // and their address is already in lastCandidateGroup. - found = (1 >= charGroupCount); - } - - if (found) { - // Okay, we found the group we should descend. Its address is in - // the lastCandidateGroupPos variable, so we just re-read it. - if (0 != lastCandidateGroupPos) { - const uint8_t lastFlags = - getFlagsAndForwardPointer(root, &lastCandidateGroupPos); - const int lastChar = - getCodePointAndForwardPointer(root, &lastCandidateGroupPos); - // We copy all the characters in this group to the buffer - outCodePoints[wordPos] = lastChar; - if (FLAG_HAS_MULTIPLE_CHARS & lastFlags) { - int nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); - int charCount = maxCodePointCount; - while (-1 != nextChar && --charCount > 0) { - outCodePoints[++wordPos] = nextChar; - nextChar = getCodePointAndForwardPointer(root, &lastCandidateGroupPos); - } - } - ++wordPos; - // Now we only need to branch to the children address. Skip the probability if - // it's there, read pos, and break to resume the search at pos. - lastCandidateGroupPos = skipProbability(lastFlags, lastCandidateGroupPos); - pos = readChildrenPosition(root, lastFlags, lastCandidateGroupPos); - break; - } else { - // Here is a little tricky part: we come here if we found out that all children - // addresses in this group are bigger than the address we are searching for. - // Should we conclude the word is not in the dictionary? No! It could still be - // one of the remaining chargroups in this node, so we have to keep looking in - // this node until we find it (or we realize it's not there either, in which - // case it's actually not in the dictionary). Pass the end of this group, ready - // to start the next one. - pos = skipChildrenPosAndAttributes(root, flags, pos); - } - } else { - // If we did not find it, we should record the last children address for the next - // iteration. - if (hasChildren) lastCandidateGroupPos = startPos; - // Now skip the end of this group (children pos and the attributes if any) so that - // our pos is after the end of this char group, at the start of the next one. - pos = skipChildrenPosAndAttributes(root, flags, pos); - } - - } - } - // If we have looked through all the chargroups and found no match, the address is - // not the address of a terminal in this dictionary. - return 0; -} - -} // namespace latinime -#endif // LATINIME_BINARY_FORMAT_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h deleted file mode 100644 index c0df89f49..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_policy_factory.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_DICTIONARY_STRUCTURE_POLICY_FACTORY_H -#define LATINIME_DICTIONARY_STRUCTURE_POLICY_FACTORY_H - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_format_utils.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" -#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" - -namespace latinime { - -class DictionaryStructurePolicy; - -class DictionaryStructurePolicyFactory { - public: - static const DictionaryStructurePolicy *getDictionaryStructurePolicy( - const BinaryDictionaryFormatUtils::FORMAT_VERSION dictionaryFormat) { - switch (dictionaryFormat) { - case BinaryDictionaryFormatUtils::VERSION_2: - return PatriciaTriePolicy::getInstance(); - case BinaryDictionaryFormatUtils::VERSION_3: - return DynamicPatriciaTriePolicy::getInstance(); - default: - ASSERT(false); - return 0; - } - } - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructurePolicyFactory); -}; -} // namespace latinime -#endif // LATINIME_DICTIONARY_STRUCTURE_POLICY_FACTORY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp new file mode 100644 index 000000000..434858d67 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +/* static */ DictionaryStructureWithBufferPolicy *DictionaryStructureWithBufferPolicyFactory + ::newDictionaryStructureWithBufferPolicy(const char *const path, const int pathLength, + const int bufOffset, const int size, const bool isUpdatable) { + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // impl classes of DictionaryStructureWithBufferPolicy. + const MmappedBuffer *const mmapedBuffer = MmappedBuffer::openBuffer(path, pathLength, bufOffset, + size, isUpdatable); + if (!mmapedBuffer) { + return 0; + } + switch (FormatUtils::detectFormatVersion(mmapedBuffer->getBuffer(), + mmapedBuffer->getBufferSize())) { + case FormatUtils::VERSION_2: + return new PatriciaTriePolicy(mmapedBuffer); + case FormatUtils::VERSION_3: + return new DynamicPatriciaTriePolicy(mmapedBuffer); + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number"); + delete mmapedBuffer; + ASSERT(false); + return 0; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h new file mode 100644 index 000000000..1cb7a89c4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H +#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicyFactory { + public: + static DictionaryStructureWithBufferPolicy *newDictionaryStructureWithBufferPolicy( + const char *const path, const int pathLength, const int bufOffset, const int size, + const bool isUpdatable); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp index 7ac635a00..6bb90fc2d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp @@ -16,48 +16,54 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(const int nodePos, const int maxCodePointCount, int *const outCodePoints) { - const uint8_t *const dictRoot = mBinaryDictionaryInfo->getDictRoot(); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(nodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); int pos = nodePos; - mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos); + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); const int parentPos = - DynamicPatriciaTrieReadingUtils::getParentPosAndAdvancePosition(dictRoot, &pos); + DynamicPatriciaTrieReadingUtils::getParentPosAndAdvancePosition(dictBuf, &pos); mParentPos = (parentPos != 0) ? mNodePos + parentPos : NOT_A_DICT_POS; if (outCodePoints != 0) { mCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictRoot, mFlags, maxCodePointCount, outCodePoints, &pos); + dictBuf, mFlags, maxCodePointCount, outCodePoints, &pos); } else { mCodePointCount = PatriciaTrieReadingUtils::skipCharacters( - dictRoot, mFlags, MAX_WORD_LENGTH, &pos); + dictBuf, mFlags, MAX_WORD_LENGTH, &pos); } if (isTerminal()) { - mProbability = PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos); + mProbability = PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictBuf, &pos); } else { mProbability = NOT_A_PROBABILITY; } - if (hasChildren()) { - mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - dictRoot, mFlags, &pos); - } else { - mChildrenPos = NOT_A_DICT_POS; + mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, mFlags, &pos); + if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) { + mChildrenPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); } if (PatriciaTrieReadingUtils::hasShortcutTargets(mFlags)) { mShortcutPos = pos; - BinaryDictionaryTerminalAttributesReadingUtils::skipShortcuts(mBinaryDictionaryInfo, &pos); + mShortcutsPolicy->skipAllShortcuts(&pos); } else { mShortcutPos = NOT_A_DICT_POS; } if (PatriciaTrieReadingUtils::hasBigrams(mFlags)) { mBigramPos = pos; - BinaryDictionaryTerminalAttributesReadingUtils::skipExistingBigrams( - mBinaryDictionaryInfo, &pos); + mBigramsPolicy->skipAllBigrams(&pos); } else { mBigramPos = NOT_A_DICT_POS; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h index 71558edaa..acc68b321 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h @@ -17,13 +17,17 @@ #ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H #define LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H +#include <stdint.h> + #include "defines.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" namespace latinime { -class BinaryDictionaryInfo; +class BufferWithExtendableBuffer; +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; /* * This class is used for helping to read nodes of dynamic patricia trie. This class handles moved @@ -31,12 +35,14 @@ class BinaryDictionaryInfo; */ class DynamicPatriciaTrieNodeReader { public: - explicit DynamicPatriciaTrieNodeReader(const BinaryDictionaryInfo *const binaryDictionaryInfo) - : mBinaryDictionaryInfo(binaryDictionaryInfo), mNodePos(NOT_A_VALID_WORD_POS), - mFlags(0), mParentPos(NOT_A_DICT_POS), mCodePointCount(0), - mProbability(NOT_A_PROBABILITY), mChildrenPos(NOT_A_DICT_POS), - mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS), - mSiblingPos(NOT_A_VALID_WORD_POS) {} + DynamicPatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, + const DictionaryBigramsStructurePolicy *const bigramsPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) + : mBuffer(buffer), mBigramsPolicy(bigramsPolicy), + mShortcutsPolicy(shortcutsPolicy), mNodePos(NOT_A_VALID_WORD_POS), mFlags(0), + mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mProbability(NOT_A_PROBABILITY), + mChildrenPos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mSiblingPos(NOT_A_VALID_WORD_POS) {} ~DynamicPatriciaTrieNodeReader() {} @@ -63,7 +69,7 @@ class DynamicPatriciaTrieNodeReader { } AK_FORCE_INLINE bool hasChildren() const { - return PatriciaTrieReadingUtils::hasChildrenInFlags(mFlags); + return mChildrenPos != NOT_A_DICT_POS; } AK_FORCE_INLINE bool isTerminal() const { @@ -116,7 +122,9 @@ class DynamicPatriciaTrieNodeReader { private: DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieNodeReader); - const BinaryDictionaryInfo *const mBinaryDictionaryInfo; + const BufferWithExtendableBuffer *const mBuffer; + const DictionaryBigramsStructurePolicy *const mBigramsPolicy; + const DictionaryShortcutsStructurePolicy *const mShortcutsPolicy; int mNodePos; DynamicPatriciaTrieReadingUtils::NodeFlags mFlags; int mParentPos; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 3df505688..3b9878b82 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -19,188 +19,127 @@ #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" namespace latinime { -const DynamicPatriciaTriePolicy DynamicPatriciaTriePolicy::sInstance; -// To avoid infinite loop caused by invalid or malicious forward links. -const int DynamicPatriciaTriePolicy::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; - void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const { + DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { return; } - DynamicPatriciaTrieNodeReader nodeReader(binaryDictionaryInfo); - int mergedNodeCodePoints[MAX_WORD_LENGTH]; - int nextPos = dicNode->getChildrenPos(); - int totalChildCount = 0; - do { - const int childCount = PatriciaTrieReadingUtils::getGroupCountAndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), &nextPos); - totalChildCount += childCount; - if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { - // Invalid dictionary. - AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", - childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); - ASSERT(false); - return; - } - for (int i = 0; i < childCount; i++) { - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nextPos, MAX_WORD_LENGTH, - mergedNodeCodePoints); - if (!nodeReader.isDeleted() && !nodeFilter->isFilteredOut(mergedNodeCodePoints[0])) { - // Push child node when the node is not deleted and not filtered out. - childDicNodes->pushLeavingChild(dicNode, nodeReader.getNodePos(), - nodeReader.getChildrenPos(), nodeReader.getProbability(), - nodeReader.isTerminal(), nodeReader.hasChildren(), - nodeReader.isBlacklisted() || nodeReader.isNotAWord(), - nodeReader.getCodePointCount(), mergedNodeCodePoints); - } - nextPos = nodeReader.getSiblingNodePos(); - } - nextPos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition( - binaryDictionaryInfo->getDictRoot(), nextPos); - } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(nextPos)); + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + childDicNodes->pushLeavingChild(dicNode, nodeReader->getNodePos(), + nodeReader->getChildrenPos(), nodeReader->getProbability(), + nodeReader->isTerminal() && !nodeReader->isDeleted(), + nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), + nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); + readingHelper.readNextSiblingNode(); + } } int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { - if (nodePos == NOT_A_VALID_WORD_POS) { - *outUnigramProbability = NOT_A_PROBABILITY; - return 0; - } // This method traverses parent nodes from the terminal by following parent pointers; thus, // node code points are stored in the buffer in the reverse order. int reverseCodePoints[maxCodePointCount]; - int mergedNodeCodePoints[maxCodePointCount]; - int codePointCount = 0; - - DynamicPatriciaTrieNodeReader nodeReader(binaryDictionaryInfo); - // First, read terminal node and get its probability. - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos, maxCodePointCount, - mergedNodeCodePoints); + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + // First, read the terminal node and get its probability. + readingHelper.initWithNodePos(nodePos); + if (!readingHelper.isValidTerminalNode()) { + // Node at the nodePos is not a valid terminal node. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } // Store terminal node probability. - *outUnigramProbability = nodeReader.getProbability(); - // Store terminal node code points to buffer in the reverse order. - for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { - reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; - } - // Then, follow parent pos toward the root node. - while (nodeReader.getParentPos() != NOT_A_DICT_POS) { - // codePointCount must be incremented at least once in each iteration to ensure preventing - // infinite loop. - if (nodeReader.isDeleted() || codePointCount > maxCodePointCount - || nodeReader.getCodePointCount() <= 0) { + *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); + // Then, following parent node link to the dictionary root and fetch node code points. + while (!readingHelper.isEnd()) { + if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { // The nodePos is not a valid terminal node position in the dictionary. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } - // Read parent node. - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodeReader.getParentPos(), - maxCodePointCount, mergedNodeCodePoints); // Store node code points to buffer in the reverse order. - for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { - reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; - } + readingHelper.fetchMergedNodeCodePointsInReverseOrder( + readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); + // Follow parent node toward the root node. + readingHelper.readParentNode(); + } + if (readingHelper.isError()) { + // The node position or the dictionary is invalid. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; } // Reverse the stored code points to output them. + const int codePointCount = readingHelper.getTotalCodePointCount(); for (int i = 0; i < codePointCount; ++i) { outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; } return codePointCount; } -int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, +int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const { int searchCodePoints[length]; for (int i = 0; i < length; ++i) { searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; } - int mergedNodeCodePoints[MAX_WORD_LENGTH]; - int currentLength = 0; - int pos = getRootPosition(); - DynamicPatriciaTrieNodeReader nodeReader(binaryDictionaryInfo); - while (currentLength <= length) { - // When foundMatchedNode becomes true, currentLength is increased at least once. - bool foundMatchedNode = false; - int totalChildCount = 0; - do { - const int childCount = PatriciaTrieReadingUtils::getGroupCountAndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), &pos); - totalChildCount += childCount; - if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { - // Invalid dictionary. - AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", - childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); - ASSERT(false); + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithNodeArrayPos(getRootPosition()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); + if (readingHelper.getTotalCodePointCount() > length + || !readingHelper.isMatchedCodePoint(0 /* index */, + searchCodePoints[matchedCodePointCount])) { + // Current node has too many code points or its first code point is different from + // target code point. Skip this node and read the next sibling node. + readingHelper.readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + if (!readingHelper.isMatchedCodePoint( + j, searchCodePoints[matchedCodePointCount + j])) { + // Different code point is found. The given word is not included in the dictionary. return NOT_A_VALID_WORD_POS; } - for (int i = 0; i < childCount; i++) { - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(pos, MAX_WORD_LENGTH, - mergedNodeCodePoints); - if (nodeReader.isDeleted() || nodeReader.getCodePointCount() <= 0) { - // Skip deleted or empty node. - pos = nodeReader.getSiblingNodePos(); - continue; - } - bool matched = true; - for (int j = 0; j < nodeReader.getCodePointCount(); ++j) { - if (mergedNodeCodePoints[j] != searchCodePoints[currentLength + j]) { - // Different code point is found. - matched = false; - break; - } - } - if (matched) { - currentLength += nodeReader.getCodePointCount(); - if (length == currentLength) { - // Terminal position is found. - return nodeReader.getNodePos(); - } - if (!nodeReader.hasChildren()) { - return NOT_A_VALID_WORD_POS; - } - foundMatchedNode = true; - // Advance to the children nodes. - pos = nodeReader.getChildrenPos(); - break; - } - // Try next sibling node. - pos = nodeReader.getSiblingNodePos(); - } - if (foundMatchedNode) { - break; - } - // If the matched node is not found in the current node group, try to follow the - // forward link. - pos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition( - binaryDictionaryInfo->getDictRoot(), pos); - } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(pos)); - if (!foundMatchedNode) { - // Matched node is not found. + } + // All characters are matched. + if (length == readingHelper.getTotalCodePointCount()) { + // Terminal position is found. + return nodeReader->getNodePos(); + } + if (!nodeReader->hasChildren()) { return NOT_A_VALID_WORD_POS; } + // Advance to the children nodes. + readingHelper.readChildNode(); } // If we already traversed the tree further than the word is long, there means // there was no match (or we would have found it). return NOT_A_VALID_WORD_POS; } -int DynamicPatriciaTriePolicy::getUnigramProbability( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) const { +int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_PROBABILITY; } - DynamicPatriciaTrieNodeReader nodeReader(binaryDictionaryInfo); + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); nodeReader.fetchNodeInfoFromBuffer(nodePos); if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { return NOT_A_PROBABILITY; @@ -208,13 +147,12 @@ int DynamicPatriciaTriePolicy::getUnigramProbability( return nodeReader.getProbability(); } -int DynamicPatriciaTriePolicy::getShortcutPositionOfNode( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const { +int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_DICT_POS; } - DynamicPatriciaTrieNodeReader nodeReader(binaryDictionaryInfo); + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); nodeReader.fetchNodeInfoFromBuffer(nodePos); if (nodeReader.isDeleted()) { return NOT_A_DICT_POS; @@ -222,13 +160,12 @@ int DynamicPatriciaTriePolicy::getShortcutPositionOfNode( return nodeReader.getShortcutPos(); } -int DynamicPatriciaTriePolicy::getBigramsPositionOfNode( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const { +int DynamicPatriciaTriePolicy::getBigramsPositionOfNode(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_DICT_POS; } - DynamicPatriciaTrieNodeReader nodeReader(binaryDictionaryInfo); + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); nodeReader.fetchNodeInfoFromBuffer(nodePos); if (nodeReader.isDeleted()) { return NOT_A_DICT_POS; @@ -236,4 +173,60 @@ int DynamicPatriciaTriePolicy::getBigramsPositionOfNode( return nodeReader.getBigramsPos(); } +bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, + const int probability) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithNodeArrayPos(getRootPosition()); + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.addUnigramWord(&readingHelper, word, length, probability); +} + +bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1, const int probability) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_VALID_WORD_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_VALID_WORD_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.addBigramWords(word0Pos, word1Pos, probability); +} + +bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + return false; + } + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_VALID_WORD_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_VALID_WORD_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.removeBigramWords(word0Pos, word1Pos); +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index 6a7977138..5873d3d65 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -18,18 +18,29 @@ #define LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H #include "defines.h" -#include "suggest/core/policy/dictionary_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" namespace latinime { -class BinaryDictionaryInfo; class DicNode; class DicNodeVector; -class DynamicPatriciaTriePolicy : public DictionaryStructurePolicy { +class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: - static AK_FORCE_INLINE const DynamicPatriciaTriePolicy *getInstance() { - return &sInstance; + DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer) + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()), + mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), + mBuffer->getBufferSize() - mHeaderPolicy.getSize()), + mBigramListPolicy(&mBufferWithExtendableBuffer), + mShortcutListPolicy(&mBufferWithExtendableBuffer) {} + + ~DynamicPatriciaTriePolicy() { + delete mBuffer; } AK_FORCE_INLINE int getRootPosition() const { @@ -37,34 +48,49 @@ class DynamicPatriciaTriePolicy : public DictionaryStructurePolicy { } void createAndGetAllChildNodes(const DicNode *const dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const; + DicNodeVector *const childDicNodes) const; int getCodePointsAndProbabilityAndReturnCodePointCount( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const; - int getTerminalNodePositionOfWord( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, + int getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const; - int getUnigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const; + int getUnigramProbability(const int nodePos) const; + + int getShortcutPositionOfNode(const int nodePos) const; + + int getBigramsPositionOfNode(const int nodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramListPolicy; + } + + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutListPolicy; + } + + bool addUnigramWord(const int *const word, const int length, const int probability); - int getShortcutPositionOfNode(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const; + bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability); - int getBigramsPositionOfNode(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const; + bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1); private: - DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTriePolicy); - static const DynamicPatriciaTriePolicy sInstance; - static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); - DynamicPatriciaTriePolicy() {} - ~DynamicPatriciaTriePolicy() {} + const MmappedBuffer *const mBuffer; + const HeaderPolicy mHeaderPolicy; + BufferWithExtendableBuffer mBufferWithExtendableBuffer; + DynamicBigramListPolicy mBigramListPolicy; + DynamicShortcutListPolicy mShortcutListPolicy; }; } // namespace latinime #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp new file mode 100644 index 000000000..2042fcbd2 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// To avoid infinite loop caused by invalid or malicious forward links. +const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; + +// Read node array size and process empty node arrays. Nodes and arrays are counted up in this +// method to avoid an infinite loop. +void DynamicPatriciaTrieReadingHelper::nextNodeArray() { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + mPos -= mBuffer->getOriginalBufferSize(); + } + mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf, + &mPos); + if (usesAdditionalBuffer) { + mPos += mBuffer->getOriginalBufferSize(); + } + // Count up nodes and node arrays to avoid infinite loop. + mTotalNodeCount += mNodeCount; + mNodeArrayCount++; + if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP + || mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { + // Invalid dictionary. + AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" + "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", + mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, + mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); + ASSERT(false); + mIsError = true; + mPos = NOT_A_DICT_POS; + return; + } + if (mNodeCount == 0) { + // Empty node array. Try following forward link. + followForwardLink(); + } +} + +// Follow the forward link and read the next node array if exists. +void DynamicPatriciaTrieReadingHelper::followForwardLink() { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + mPos -= mBuffer->getOriginalBufferSize(); + } + const int forwardLinkPosition = + DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos); + if (usesAdditionalBuffer) { + mPos += mBuffer->getOriginalBufferSize(); + } + if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { + // Follow the forward link. + mPos = forwardLinkPosition; + nextNodeArray(); + } else { + // All node arrays have been read. + mPos = NOT_A_DICT_POS; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h new file mode 100644 index 000000000..b108ed5fb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and + * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. + */ +class DynamicPatriciaTrieReadingHelper { + public: + DynamicPatriciaTrieReadingHelper(const BufferWithExtendableBuffer *const buffer, + const DictionaryBigramsStructurePolicy *const bigramsPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) + : mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), + mTotalNodeCount(0), mNodeArrayCount(0), mBuffer(buffer), + mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy) {} + + ~DynamicPatriciaTrieReadingHelper() {} + + AK_FORCE_INLINE bool isError() const { + return mIsError; + } + + AK_FORCE_INLINE bool isEnd() const { + return mPos == NOT_A_DICT_POS; + } + + // Initialize reading state with the head position of a node array. + AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) { + if (nodeArrayPos == NOT_A_DICT_POS) { + mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mPos = nodeArrayPos; + mNodeCount = 0; + mPrevTotalCodePointCount = 0; + mTotalNodeCount = 0; + mNodeArrayCount = 0; + nextNodeArray(); + if (!isEnd()) { + fetchNodeInfo(); + } + } + } + + // Initialize reading state with the head position of a node. + AK_FORCE_INLINE void initWithNodePos(const int nodePos) { + // TODO: Consolidate NOT_A_VALID_WORD_POS and NOT_A_DICT_POS + if (nodePos == NOT_A_VALID_WORD_POS || nodePos == NOT_A_DICT_POS) { + mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mPos = nodePos; + mNodeCount = 1; + mPrevTotalCodePointCount = 0; + mTotalNodeCount = 1; + mNodeArrayCount = 1; + fetchNodeInfo(); + } + } + + AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const { + return &mNodeReader; + } + + AK_FORCE_INLINE bool isValidTerminalNode() const { + return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal(); + } + + AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const { + return mMergedNodeCodePoints[index] == codePoint; + } + + // Return code point count exclude the last read node's code points. + AK_FORCE_INLINE int getPrevTotalCodePointCount() const { + return mPrevTotalCodePointCount; + } + + // Return code point count include the last read node's code points. + AK_FORCE_INLINE int getTotalCodePointCount() const { + return mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); + } + + AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( + const int index, int *const outCodePoints) const { + const int nodeCodePointCount = mNodeReader.getCodePointCount(); + for (int i = 0; i < nodeCodePointCount; ++i) { + outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i]; + } + } + + AK_FORCE_INLINE const int *getMergedNodeCodePoints() const { + return mMergedNodeCodePoints; + } + + AK_FORCE_INLINE void readNextSiblingNode() { + mNodeCount -= 1; + mPos = mNodeReader.getSiblingNodePos(); + if (mNodeCount <= 0) { + // All nodes in the current node array have been read. + followForwardLink(); + if (!isEnd()) { + fetchNodeInfo(); + } + } else { + fetchNodeInfo(); + } + } + + // Read the first child node of the current node. + AK_FORCE_INLINE void readChildNode() { + if (mNodeReader.hasChildren()) { + mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mTotalNodeCount = 0; + mNodeArrayCount = 0; + mPos = mNodeReader.getChildrenPos(); + // Read children node array. + nextNodeArray(); + if (!isEnd()) { + fetchNodeInfo(); + } + } else { + mPos = NOT_A_DICT_POS; + } + } + + // Read the parent node of the current node. + AK_FORCE_INLINE void readParentNode() { + if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { + mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mTotalNodeCount = 1; + mNodeArrayCount = 1; + mNodeCount = 1; + mPos = mNodeReader.getParentPos(); + fetchNodeInfo(); + } else { + mPos = NOT_A_DICT_POS; + } + } + + private: + DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); + + static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; + static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; + + bool mIsError; + int mPos; + // Node count of a node array. + int mNodeCount; + int mPrevTotalCodePointCount; + int mTotalNodeCount; + int mNodeArrayCount; + const BufferWithExtendableBuffer *const mBuffer; + DynamicPatriciaTrieNodeReader mNodeReader; + int mMergedNodeCodePoints[MAX_WORD_LENGTH]; + + void nextNodeArray(); + + void followForwardLink(); + + AK_FORCE_INLINE void fetchNodeInfo() { + mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH, + mMergedNodeCodePoints); + if (mNodeReader.getCodePointCount() <= 0) { + // Empty node is not allowed. + mIsError = true; + mPos = NOT_A_DICT_POS; + } + } +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp index 0de6341b0..5d979fa51 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp @@ -17,7 +17,7 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "defines.h" -#include "suggest/core/dictionary/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { @@ -32,7 +32,13 @@ const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_DELETED = 0x80; const uint8_t *const buffer, const NodeFlags flags, int *const pos) { if ((flags & MASK_MOVED) == FLAG_IS_NOT_MOVED) { const int base = *pos; - return base + ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); + const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); + if (offset == 0) { + // 0 offset means that the node does not have children. + return NOT_A_DICT_POS; + } else { + return base + offset; + } } else { return NOT_A_DICT_POS; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h index 5398d7e37..a6cb46d39 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h @@ -20,7 +20,7 @@ #include <stdint.h> #include "defines.h" -#include "suggest/core/dictionary/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..128d69d88 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" + +namespace latinime { + +bool DynamicPatriciaTrieWritingHelper::addUnigramWord( + DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability) { + while (!readingHelper->isEnd()) { + const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper->getNodeReader(); + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + const int nextIndex = matchedCodePointCount + j; + if (nextIndex >= codePointCount) { + // TODO: split current node after j - 1, create child and make this terminal. + return false; + } + if (!readingHelper->isMatchedCodePoint(j, + wordCodePoints[matchedCodePointCount + j])) { + // TODO: split current node after j - 1 and create two children. + return false; + } + } + // All characters are matched. + if (codePointCount == readingHelper->getTotalCodePointCount()) { + if (nodeReader->isTerminal()) { + // TODO: Update probability. + } else { + // TODO: Make it terminal and update probability. + } + return false; + } + if (!nodeReader->hasChildren()) { + // TODO: Create children node array and add new node as a child. + return false; + } + // Advance to the children nodes. + readingHelper->readChildNode(); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + // TODO: add at the last position of the node array. + return false; +} + +bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, + const int probability) { + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoFromBuffer(word0Pos); + if (nodeReader.isDeleted()) { + return false; + } + // TODO: Implement. + return false; +} + +// Remove a bigram relation from word0Pos to word1Pos. +bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoFromBuffer(word0Pos); + if (nodeReader.isDeleted() || nodeReader.getBigramsPos() == NOT_A_DICT_POS) { + return false; + } + // TODO: Implement. + return false; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h new file mode 100644 index 000000000..f8165fc3d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H + +#include "defines.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DynamicBigramListPolicy; +class DynamicPatriciaTrieReadingHelper; +class DynamicShortcutListPolicy; + +class DynamicPatriciaTrieWritingHelper { + public: + DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy *const bigramPolicy, + DynamicShortcutListPolicy *const shortcutPolicy) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} + + ~DynamicPatriciaTrieWritingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability); + + // Add a bigram relation from word0Pos to word1Pos. + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability); + + // Remove a bigram relation from word0Pos to word1Pos. + bool removeBigramWords(const int word0Pos, const int word1Pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); + + BufferWithExtendableBuffer *const mBuffer; + DynamicBigramListPolicy *const mBigramPolicy; + DynamicShortcutListPolicy *const mShortcutPolicy; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp new file mode 100644 index 000000000..eb828b58c --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/header/header_policy.h" + +namespace latinime { + +const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = + "MULTIPLE_WORDS_DEMOTION_RATE"; +const float HeaderPolicy::DEFAULT_MULTI_WORD_COST_MULTIPLIER = 1.0f; +const float HeaderPolicy::MULTI_WORD_COST_MULTIPLIER_SCALE = 100.0f; + +float HeaderPolicy::readMultiWordCostMultiplier() const { + const int headerValue = HeaderReadingUtils::readHeaderValueInt( + mDictBuf, MULTIPLE_WORDS_DEMOTION_RATE_KEY); + if (headerValue == S_INT_MIN) { + // not found + return DEFAULT_MULTI_WORD_COST_MULTIPLIER; + } + if (headerValue <= 0) { + return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + } + return MULTI_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(headerValue); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 240512bce..e3e6fc077 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_header.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -14,38 +14,40 @@ * limitations under the License. */ -#ifndef LATINIME_BINARY_DICTIONARY_HEADER_H -#define LATINIME_BINARY_DICTIONARY_HEADER_H +#ifndef LATINIME_HEADER_POLICY_H +#define LATINIME_HEADER_POLICY_H + +#include <stdint.h> #include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_header_reading_utils.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/header/header_reading_utils.h" namespace latinime { -class BinaryDictionaryInfo; - -/** - * This class abstracts dictionary header structures and provide interface to access dictionary - * header information. - */ -class BinaryDictionaryHeader { +class HeaderPolicy : public DictionaryHeaderStructurePolicy { public: - explicit BinaryDictionaryHeader(const BinaryDictionaryInfo *const binaryDictionaryInfo); + explicit HeaderPolicy(const uint8_t *const dictBuf) + : mDictBuf(dictBuf), mDictionaryFlags(HeaderReadingUtils::getFlags(dictBuf)), + mSize(HeaderReadingUtils::getHeaderSize(dictBuf)), + mMultiWordCostMultiplier(readMultiWordCostMultiplier()) {} + + ~HeaderPolicy() {} AK_FORCE_INLINE int getSize() const { return mSize; } AK_FORCE_INLINE bool supportsDynamicUpdate() const { - return BinaryDictionaryHeaderReadingUtils::supportsDynamicUpdate(mDictionaryFlags); + return HeaderReadingUtils::supportsDynamicUpdate(mDictionaryFlags); } AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { - return BinaryDictionaryHeaderReadingUtils::requiresGermanUmlautProcessing(mDictionaryFlags); + return HeaderReadingUtils::requiresGermanUmlautProcessing(mDictionaryFlags); } AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const { - return BinaryDictionaryHeaderReadingUtils::requiresFrenchLigatureProcessing( + return HeaderReadingUtils::requiresFrenchLigatureProcessing( mDictionaryFlags); } @@ -60,7 +62,7 @@ class BinaryDictionaryHeader { outValue[0] = '\0'; return; } - if (!BinaryDictionaryHeaderReadingUtils::readHeaderValue(mBinaryDictionaryInfo, + if (!HeaderReadingUtils::readHeaderValue(mDictBuf, key, outValue, outValueSize)) { outValue[0] = '?'; outValue[1] = '\0'; @@ -68,18 +70,19 @@ class BinaryDictionaryHeader { } private: - DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryHeader); + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; static const float DEFAULT_MULTI_WORD_COST_MULTIPLIER; static const float MULTI_WORD_COST_MULTIPLIER_SCALE; - const BinaryDictionaryInfo *const mBinaryDictionaryInfo; - const BinaryDictionaryHeaderReadingUtils::DictionaryFlags mDictionaryFlags; + const uint8_t *const mDictBuf; + const HeaderReadingUtils::DictionaryFlags mDictionaryFlags; const int mSize; const float mMultiWordCostMultiplier; float readMultiWordCostMultiplier() const; }; + } // namespace latinime -#endif // LATINIME_BINARY_DICTIONARY_HEADER_H +#endif /* LATINIME_HEADER_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.cpp new file mode 100644 index 000000000..f323876c4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/header/header_reading_utils.h" + +#include <cctype> +#include <cstdlib> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const int HeaderReadingUtils::MAX_OPTION_KEY_LENGTH = 256; + +const int HeaderReadingUtils::HEADER_MAGIC_NUMBER_SIZE = 4; +const int HeaderReadingUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; +const int HeaderReadingUtils::HEADER_FLAG_SIZE = 2; +const int HeaderReadingUtils::HEADER_SIZE_FIELD_SIZE = 4; + +const HeaderReadingUtils::DictionaryFlags + HeaderReadingUtils::NO_FLAGS = 0; +// Flags for special processing +// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or +// something very bad (like, the apocalypse) will happen. Please update both at the same time. +const HeaderReadingUtils::DictionaryFlags + HeaderReadingUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; +const HeaderReadingUtils::DictionaryFlags + HeaderReadingUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; +const HeaderReadingUtils::DictionaryFlags + HeaderReadingUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; + +/* static */ int HeaderReadingUtils::getHeaderSize(const uint8_t *const dictBuf) { + // See the format of the header in the comment in + // BinaryDictionaryFormatUtils::detectFormatVersion() + return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE + + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE); +} + +/* static */ HeaderReadingUtils::DictionaryFlags + HeaderReadingUtils::getFlags(const uint8_t *const dictBuf) { + return ByteArrayUtils::readUint16(dictBuf, + HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE); +} + +// Returns if the key is found or not and reads the found value into outValue. +/* static */ bool HeaderReadingUtils::readHeaderValue(const uint8_t *const dictBuf, + const char *const key, int *outValue, const int outValueSize) { + if (outValueSize <= 0) { + return false; + } + const int headerSize = getHeaderSize(dictBuf); + int pos = getHeaderOptionsPosition(); + if (pos == NOT_A_DICT_POS) { + // The header doesn't have header options. + return false; + } + while (pos < headerSize) { + if(ByteArrayUtils::compareStringInBufferWithCharArray( + dictBuf, key, headerSize - pos, &pos) == 0) { + // The key was found. + const int length = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, outValueSize, + outValue, &pos); + // Add a 0 terminator to the string. + outValue[length < outValueSize ? length : outValueSize - 1] = '\0'; + return true; + } + ByteArrayUtils::advancePositionToBehindString(dictBuf, headerSize - pos, &pos); + } + // The key was not found. + return false; +} + +/* static */ int HeaderReadingUtils::readHeaderValueInt( + const uint8_t *const dictBuf, const char *const key) { + const int bufferSize = LARGEST_INT_DIGIT_COUNT; + int intBuffer[bufferSize]; + char charBuffer[bufferSize]; + if (!readHeaderValue(dictBuf, key, intBuffer, bufferSize)) { + return S_INT_MIN; + } + for (int i = 0; i < bufferSize; ++i) { + charBuffer[i] = intBuffer[i]; + if (charBuffer[i] == '0') { + break; + } + if (!isdigit(charBuffer[i])) { + // If not a number, return S_INT_MIN + return S_INT_MIN; + } + } + return atoi(charBuffer); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.h new file mode 100644 index 000000000..c94919640 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_READING_UTILS_H +#define LATINIME_HEADER_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class HeaderReadingUtils { + public: + typedef uint16_t DictionaryFlags; + + static const int MAX_OPTION_KEY_LENGTH; + + static int getHeaderSize(const uint8_t *const dictBuf); + + static DictionaryFlags getFlags(const uint8_t *const dictBuf); + + static AK_FORCE_INLINE bool supportsDynamicUpdate(const DictionaryFlags flags) { + return (flags & SUPPORTS_DYNAMIC_UPDATE_FLAG) != 0; + } + + static AK_FORCE_INLINE bool requiresGermanUmlautProcessing(const DictionaryFlags flags) { + return (flags & GERMAN_UMLAUT_PROCESSING_FLAG) != 0; + } + + static AK_FORCE_INLINE bool requiresFrenchLigatureProcessing(const DictionaryFlags flags) { + return (flags & FRENCH_LIGATURE_PROCESSING_FLAG) != 0; + } + + static AK_FORCE_INLINE int getHeaderOptionsPosition() { + return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE + + HEADER_SIZE_FIELD_SIZE; + } + + static bool readHeaderValue(const uint8_t *const dictBuf, + const char *const key, int *outValue, const int outValueSize); + + static int readHeaderValueInt(const uint8_t *const dictBuf, const char *const key); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadingUtils); + + static const int HEADER_MAGIC_NUMBER_SIZE; + static const int HEADER_DICTIONARY_VERSION_SIZE; + static const int HEADER_FLAG_SIZE; + static const int HEADER_SIZE_FIELD_SIZE; + + static const DictionaryFlags NO_FLAGS; + // Flags for special processing + // Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAGS) or + // something very bad (like, the apocalypse) will happen. Please update both at the same time. + static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG; + static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG; + static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG; + static const DictionaryFlags CONTAINS_BIGRAMS_FLAG; +}; +} +#endif /* LATINIME_HEADER_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp index 097f7c86a..adcf2dbdf 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp @@ -20,55 +20,299 @@ #include "defines.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/binary_dictionary_info.h" -#include "suggest/core/dictionary/binary_dictionary_terminal_attributes_reading_utils.h" -#include "suggest/policyimpl/dictionary/binary_format.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" namespace latinime { -const PatriciaTriePolicy PatriciaTriePolicy::sInstance; - void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const { + DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { return; } int nextPos = dicNode->getChildrenPos(); - const int childCount = PatriciaTrieReadingUtils::getGroupCountAndAdvancePosition( - binaryDictionaryInfo->getDictRoot(), &nextPos); + const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictRoot, &nextPos); for (int i = 0; i < childCount; i++) { - nextPos = createAndGetLeavingChildNode(dicNode, nextPos, binaryDictionaryInfo, - nodeFilter, childDicNodes); + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); } } +// This retrieves code points and the probability of the word by its terminal position. +// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, +// it is possible to check for this with advantageous complexity. For each node, we search +// for PtNodes with children and compare the children position with the position we look for. +// When we shoot the position we look for, it means the word we look for is in the children +// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a +// PtNode array with the last PtNode's children position still less than what we are searching for, +// we must descend the last PtNode's children (for example, if the word we are searching for starts +// with a z, it's the last PtNode of the root array, so all children addresses will be smaller +// than the position we look for, and we have to descend the z node). +/* Parameters : + * nodePos: the byte position of the terminal PtNode of the word we are searching for (this is + * what is stored as the "bigram position" in each bigram) + * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramProbability: a pointer to an int to write the probability into. + * Return value : the code point count, of 0 if the word was not found. + */ +// TODO: Split this function to be more readable int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { - return BinaryFormat::getCodePointsAndProbabilityAndReturnCodePointCount( - binaryDictionaryInfo->getDictRoot(), nodePos, - maxCodePointCount, outCodePoints, outUnigramProbability); + int pos = getRootPosition(); + int wordPos = 0; + // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will + // only traverse nodes that are actually a part of the terminal we are searching, so each time + // we enter this loop we are one depth level further than last time. + // The only reason we count nodes is because we want to reduce the probability of infinite + // looping in case there is a bug. Since we know there is an upper bound to the depth we are + // supposed to traverse, it does not hurt to count iterations. + for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { + int lastCandidatePtNodePos = 0; + // Let's loop through PtNodes in this PtNode array searching for either the terminal + // or one of its ascendants. + for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { + const int startPos = pos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + if (nodePos == startPos) { + // We found the position. Copy the rest of the code points in the buffer and return + // the length. + outCodePoints[wordPos] = character; + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + // We count code points in order to avoid infinite loops if the file is broken + // or if there is some other bug + int charCount = maxCodePointCount; + while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + } + } + *outUnigramProbability = + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + &pos); + return ++wordPos; + } + // We need to skip past this PtNode, so skip any remaining code points after the + // first and possibly the probability. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + // The fact that this PtNode has children is very important. Since we already know + // that this PtNode does not match, if it has no children we know it is irrelevant + // to what we are searching for. + const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); + // We will write in `found' whether we have passed the children position we are + // searching for. For example if we search for "beer", the children of b are less + // than the address we are searching for and the children of c are greater. When we + // come here for c, we realize this is too big, and that we should descend b. + bool found; + if (hasChildren) { + int currentPos = pos; + // Here comes the tricky part. First, read the children position. + const int childrenPos = PatriciaTrieReadingUtils + ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, ¤tPos); + if (childrenPos > nodePos) { + // If the children pos is greater than the position, it means the previous + // PtNode, which position is stored in lastCandidatePtNodePos, was the right + // one. + found = true; + } else if (1 >= ptNodeCount) { + // However if we are on the LAST PtNode of this array, and we have NOT shot the + // position we should descend THIS node. So we trick the lastCandidatePtNodePos + // so that we will descend this PtNode, not the previous one. + lastCandidatePtNodePos = startPos; + found = true; + } else { + // Else, we should continue looking. + found = false; + } + } else { + // Even if we don't have children here, we could still be on the last PtNode of / + // this array. If this is the case, we should descend the last PtNode that had + // children, and their position is already in lastCandidatePtNodePos. + found = (1 >= ptNodeCount); + } + + if (found) { + // Okay, we found the PtNode we should descend. Its position is in + // the lastCandidatePtNodePos variable, so we just re-read it. + if (0 != lastCandidatePtNodePos) { + const PatriciaTrieReadingUtils::NodeFlags lastFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + // We copy all the characters in this PtNode to the buffer + outCodePoints[wordPos] = lastChar; + if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + int charCount = maxCodePointCount; + while (-1 != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + } + } + ++wordPos; + // Now we only need to branch to the children address. Skip the probability if + // it's there, read pos, and break to resume the search at pos. + if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + &lastCandidatePtNodePos); + } + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, lastFlags, &lastCandidatePtNodePos); + break; + } else { + // Here is a little tricky part: we come here if we found out that all children + // addresses in this PtNode are bigger than the address we are searching for. + // Should we conclude the word is not in the dictionary? No! It could still be + // one of the remaining PtNodes in this array, so we have to keep looking in + // this array until we find it (or we realize it's not there either, in which + // case it's actually not in the dictionary). Pass the end of this PtNode, + // ready to start the next one. + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + } else { + // If we did not find it, we should record the last children address for the next + // iteration. + if (hasChildren) lastCandidatePtNodePos = startPos; + // Now skip the end of this PtNode (children pos and the attributes if any) so that + // our pos is after the end of this PtNode, at the start of the next one. + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + + } + } + // If we have looked through all the PtNodes and found no match, the nodePos is + // not the position of a terminal in this dictionary. + return 0; } -int PatriciaTriePolicy::getTerminalNodePositionOfWord( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, +// This function gets the position of the terminal node of the exact matching word in the +// dictionary. If no match is found, it returns NOT_A_VALID_WORD_POS. +int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const { - return BinaryFormat::getTerminalPosition(binaryDictionaryInfo->getDictRoot(), inWord, - length, forceLowerCaseSearch); + int pos = getRootPosition(); + int wordPos = 0; + + while (true) { + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + if (wordPos >= length) return NOT_A_VALID_WORD_POS; + int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot, + &pos); + const int wChar = forceLowerCaseSearch + ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; + while (true) { + // If there are no more PtNodes in this array, it means we could not + // find a matching character for this depth, therefore there is no match. + if (0 >= ptNodeCount) return NOT_A_VALID_WORD_POS; + const int ptNodePos = pos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, + &pos); + if (character == wChar) { + // This is the correct PtNode. Only one PtNode may start with the same char within + // a PtNode array, so either we found our match in this array, or there is + // no match and we can return NOT_A_VALID_WORD_POS. So we will check all the + // characters in this PtNode indeed does match. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, + &pos); + while (NOT_A_CODE_POINT != character) { + ++wordPos; + // If we shoot the length of the word we search for, or if we find a single + // character that does not match, as explained above, it means the word is + // not in the dictionary (by virtue of this PtNode being the only one to + // match the word on the first character, but not matching the whole word). + if (wordPos >= length) return NOT_A_VALID_WORD_POS; + if (inWord[wordPos] != character) return NOT_A_VALID_WORD_POS; + character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + } + } + // If we come here we know that so far, we do match. Either we are on a terminal + // and we match the length, in which case we found it, or we traverse children. + // If we don't match the length AND don't have children, then a word in the + // dictionary fully matches a prefix of the searched word but not the full word. + ++wordPos; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + if (wordPos == length) { + return ptNodePos; + } + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (!PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + return NOT_A_VALID_WORD_POS; + } + // We have children and we are still shorter than the word we are searching for, so + // we need to traverse children. Put the pointer on the children position, and + // break + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, + flags, &pos); + break; + } else { + // This PtNode does not match, so skip the remaining part and go to the next. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, + &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, + flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + --ptNodeCount; + } + } } -int PatriciaTriePolicy::getUnigramProbability( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int nodePos) const { +int PatriciaTriePolicy::getUnigramProbability(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_PROBABILITY; } - const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot(); int pos = nodePos; const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); if (!PatriciaTrieReadingUtils::isTerminal(flags)) { return NOT_A_PROBABILITY; } @@ -79,90 +323,79 @@ int PatriciaTriePolicy::getUnigramProbability( // for shortcuts). return NOT_A_PROBABILITY; } - PatriciaTrieReadingUtils::skipCharacters(dictRoot, flags, MAX_WORD_LENGTH, &pos); - return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + return PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); } -int PatriciaTriePolicy::getShortcutPositionOfNode( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const { +int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_DICT_POS; } - const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot(); int pos = nodePos; const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { return NOT_A_DICT_POS; } - PatriciaTrieReadingUtils::skipCharacters(dictRoot, flags, MAX_WORD_LENGTH, &pos); + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); } if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(dictRoot, flags, &pos); + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); } return pos; } -int PatriciaTriePolicy::getBigramsPositionOfNode( - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const { +int PatriciaTriePolicy::getBigramsPositionOfNode(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_DICT_POS; } - const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot(); int pos = nodePos; const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); if (!PatriciaTrieReadingUtils::hasBigrams(flags)) { return NOT_A_DICT_POS; } - PatriciaTrieReadingUtils::skipCharacters(dictRoot, flags, MAX_WORD_LENGTH, &pos); + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); if (PatriciaTrieReadingUtils::isTerminal(flags)) { - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); } if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { - PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(dictRoot, flags, &pos); + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); } if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - BinaryDictionaryTerminalAttributesReadingUtils::skipShortcuts(binaryDictionaryInfo, &pos); + mShortcutListPolicy.skipAllShortcuts(&pos);; } return pos; } int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, - const int nodePos, const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const childrenFilter, DicNodeVector *childDicNodes) const { - const uint8_t *const dictRoot = binaryDictionaryInfo->getDictRoot(); + const int nodePos, DicNodeVector *childDicNodes) const { int pos = nodePos; const PatriciaTrieReadingUtils::NodeFlags flags = - PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictRoot, &pos); + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); int mergedNodeCodePoints[MAX_WORD_LENGTH]; const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( - dictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); + mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))? - PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictRoot, &pos) + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos) : NOT_A_PROBABILITY; const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ? PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - dictRoot, flags, &pos) : NOT_A_DICT_POS; + mDictRoot, flags, &pos) : NOT_A_DICT_POS; if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { - BinaryDictionaryTerminalAttributesReadingUtils::skipShortcuts(binaryDictionaryInfo, &pos); + getShortcutsStructurePolicy()->skipAllShortcuts(&pos); } if (PatriciaTrieReadingUtils::hasBigrams(flags)) { - BinaryDictionaryTerminalAttributesReadingUtils::skipExistingBigrams( - binaryDictionaryInfo, &pos); - } - if (!childrenFilter->isFilteredOut(mergedNodeCodePoints[0])) { - childDicNodes->pushLeavingChild(dicNode, nodePos, childrenPos, probability, - PatriciaTrieReadingUtils::isTerminal(flags), - PatriciaTrieReadingUtils::hasChildrenInFlags(flags), - PatriciaTrieReadingUtils::isBlacklisted(flags) || - PatriciaTrieReadingUtils::isNotAWord(flags), - mergedNodeCodePointCount, mergedNodeCodePoints); + getBigramsStructurePolicy()->skipAllBigrams(&pos); } + childDicNodes->pushLeavingChild(dicNode, nodePos, childrenPos, probability, + PatriciaTrieReadingUtils::isTerminal(flags), + PatriciaTrieReadingUtils::hasChildrenInFlags(flags), + PatriciaTrieReadingUtils::isBlacklisted(flags) || + PatriciaTrieReadingUtils::isNotAWord(flags), + mergedNodeCodePointCount, mergedNodeCodePoints); return pos; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h index 71f256eee..d0567fd85 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -17,15 +17,29 @@ #ifndef LATINIME_PATRICIA_TRIE_POLICY_H #define LATINIME_PATRICIA_TRIE_POLICY_H +#include <stdint.h> + #include "defines.h" -#include "suggest/core/policy/dictionary_structure_policy.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" namespace latinime { -class PatriciaTriePolicy : public DictionaryStructurePolicy { +class DicNode; +class DicNodeVector; + +class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: - static AK_FORCE_INLINE const PatriciaTriePolicy *getInstance() { - return &sInstance; + PatriciaTriePolicy(const MmappedBuffer *const buffer) + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()), + mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()), + mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {} + + ~PatriciaTriePolicy() { + delete mBuffer; } AK_FORCE_INLINE int getRootPosition() const { @@ -33,37 +47,64 @@ class PatriciaTriePolicy : public DictionaryStructurePolicy { } void createAndGetAllChildNodes(const DicNode *const dicNode, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const; + DicNodeVector *const childDicNodes) const; int getCodePointsAndProbabilityAndReturnCodePointCount( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const; - int getTerminalNodePositionOfWord( - const BinaryDictionaryInfo *const binaryDictionaryInfo, const int *const inWord, + int getTerminalNodePositionOfWord(const int *const inWord, const int length, const bool forceLowerCaseSearch) const; - int getUnigramProbability(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const; + int getUnigramProbability(const int nodePos) const; + + int getShortcutPositionOfNode(const int nodePos) const; - int getShortcutPositionOfNode(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const; + int getBigramsPositionOfNode(const int nodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramListPolicy; + } - int getBigramsPositionOfNode(const BinaryDictionaryInfo *const binaryDictionaryInfo, - const int nodePos) const; + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutListPolicy; + } + + bool addUnigramWord(const int *const word, const int length, const int probability) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + + bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + + bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + return false; + } private: - DISALLOW_COPY_AND_ASSIGN(PatriciaTriePolicy); - static const PatriciaTriePolicy sInstance; + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); - PatriciaTriePolicy() {} - ~PatriciaTriePolicy() {} + const MmappedBuffer *const mBuffer; + const HeaderPolicy mHeaderPolicy; + const uint8_t *const mDictRoot; + const BigramListPolicy mBigramListPolicy; + const ShortcutListPolicy mShortcutListPolicy; int createAndGetLeavingChildNode(const DicNode *const dicNode, const int nodePos, - const BinaryDictionaryInfo *const binaryDictionaryInfo, - const NodeFilter *const nodeFilter, DicNodeVector *const childDicNodes) const; + DicNodeVector *const childDicNodes) const; }; } // namespace latinime #endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp index 89e981df8..576a158bc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp @@ -17,21 +17,21 @@ #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "defines.h" -#include "suggest/core/dictionary/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { typedef PatriciaTrieReadingUtils PtReadingUtils; -const PtReadingUtils::NodeFlags PtReadingUtils::MASK_GROUP_ADDRESS_TYPE = 0xC0; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80; -const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0; +const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; // Flag for single/multiple char group const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; -// Flag for terminal groups +// Flag for terminal PtNodes const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; // Flag for shortcut targets presence const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; @@ -46,14 +46,14 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; const uint8_t *const buffer, const NodeFlags flags, int *const pos) { const int base = *pos; int offset = 0; - switch (MASK_GROUP_ADDRESS_TYPE & flags) { - case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE: + switch (MASK_CHILDREN_POSITION_TYPE & flags) { + case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); break; - case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES: + case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); break; - case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES: + case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); break; default: diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h index 002c3f19b..f76c38751 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h @@ -20,7 +20,7 @@ #include <stdint.h> #include "defines.h" -#include "suggest/core/dictionary/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { @@ -28,7 +28,7 @@ class PatriciaTrieReadingUtils { public: typedef uint8_t NodeFlags; - static AK_FORCE_INLINE int getGroupCountAndAdvancePosition( + static AK_FORCE_INLINE int getPtNodeArraySizeAndAdvancePosition( const uint8_t *const buffer, int *const pos) { const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); if (firstByte < 0x80) { @@ -116,17 +116,17 @@ class PatriciaTrieReadingUtils { } static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) { - return FLAG_GROUP_ADDRESS_TYPE_NOADDRESS != (MASK_GROUP_ADDRESS_TYPE & flags); + return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags); } private: DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); - static const NodeFlags MASK_GROUP_ADDRESS_TYPE; - static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_NOADDRESS; - static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_ONEBYTE; - static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_TWOBYTES; - static const NodeFlags FLAG_GROUP_ADDRESS_TYPE_THREEBYTES; + static const NodeFlags MASK_CHILDREN_POSITION_TYPE; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; static const NodeFlags FLAG_HAS_MULTIPLE_CHARS; static const NodeFlags FLAG_IS_TERMINAL; diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h new file mode 100644 index 000000000..5e9c52950 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H +#define LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +/* + * This is a dynamic version of ShortcutListPolicy and supports an additional buffer. + */ +class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + explicit DynamicShortcutListPolicy(BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} + + ~DynamicShortcutListPolicy() {} + + int getStartPos(const int pos) const { + if (pos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + return pos + ShortcutListReadingUtils::getShortcutListSizeFieldSize(); + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + const ShortcutListReadingUtils::ShortcutFlags flags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(buffer, pos); + if (outHasNext) { + *outHasNext = ShortcutListReadingUtils::hasNext(flags); + } + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); + } + if (outCodePoint) { + *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( + buffer, maxCodePointCount, outCodePoint, pos); + } + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + void skipAllShortcuts(int *const pos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(buffer, pos); + *pos += shortcutListSize; + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + // Copy shortcuts from the shortcut list that starts at fromPos to toPos and advance these + // positions after the shortcut lists. + void copyAllShortcuts(int *const fromPos, int *const toPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(buffer, fromPos); + // Copy shortcut list size. + mBuffer->writeUintAndAdvancePosition( + shortcutListSize + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), toPos); + for (int i = 0; i < shortcutListSize; ++i) { + const uint8_t data = ByteArrayUtils::readUint8AndAdvancePosition(buffer, fromPos); + mBuffer->writeUintAndAdvancePosition(data, 1 /* size */, toPos); + } + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); + } + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicShortcutListPolicy); + + BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace latinime +#endif // LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h new file mode 100644 index 000000000..d73f73953 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_POLICY_H +#define LATINIME_SHORTCUT_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" + +namespace latinime { + +class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + explicit ShortcutListPolicy(const uint8_t *const shortcutBuf) + : mShortcutsBuf(shortcutBuf) {} + + ~ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + if (pos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int listPos = pos; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mShortcutsBuf, &listPos); + return listPos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + const ShortcutListReadingUtils::ShortcutFlags flags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mShortcutsBuf, pos); + if (outHasNext) { + *outHasNext = ShortcutListReadingUtils::hasNext(flags); + } + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); + } + if (outCodePoint) { + *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( + mShortcutsBuf, maxCodePointCount, outCodePoint, pos); + } + } + + void skipAllShortcuts(int *const pos) const { + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(mShortcutsBuf, pos); + *pos += shortcutListSize; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy); + + const uint8_t *const mShortcutsBuf; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp new file mode 100644 index 000000000..e70bb5071 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" + +namespace latinime { + +// Flag for presence of more attributes +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; +// The numeric value of the shortcut probability that means 'whitelist'. +const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h new file mode 100644 index 000000000..5f4f240f5 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H +#define LATINIME_SHORTCUT_LIST_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +class ShortcutListReadingUtils { + public: + typedef uint8_t ShortcutFlags; + + static AK_FORCE_INLINE ShortcutFlags getFlagsAndForwardPointer( + const uint8_t *const dictRoot, int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(dictRoot, pos); + } + + static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // This method returns the size of the shortcut list region excluding the shortcut list size + // field at the beginning. + static AK_FORCE_INLINE int getShortcutListSizeAndForwardPointer( + const uint8_t *const dictRoot, int *const pos) { + // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself. + return ByteArrayUtils::readUint16AndAdvancePosition(dictRoot, pos) + - SHORTCUT_LIST_SIZE_FIELD_SIZE; + } + + static AK_FORCE_INLINE int getShortcutListSizeFieldSize() { + return SHORTCUT_LIST_SIZE_FIELD_SIZE; + } + + static AK_FORCE_INLINE void skipShortcuts(const uint8_t *const dictRoot, int *const pos) { + const int shortcutListSize = getShortcutListSizeAndForwardPointer(dictRoot, pos); + *pos += shortcutListSize; + } + + static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) { + return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; + } + + static AK_FORCE_INLINE int readShortcutTarget( + const uint8_t *const dictRoot, const int maxLength, int *const outWord, + int *const pos) { + return ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils); + + static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY; + static const int SHORTCUT_LIST_SIZE_FIELD_SIZE; + static const int WHITELIST_SHORTCUT_PROBABILITY; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp new file mode 100644 index 000000000..8582c4b81 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t BufferWithExtendableBuffer::INITIAL_ADDITIONAL_BUFFER_SIZE = 16 * 1024; +const size_t BufferWithExtendableBuffer::MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 16 * 1024; + +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 000000000..ec871ec85 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H +#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H + +#include <cstddef> +#include <stdint.h> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// This is used as a buffer that can be extended for updatable dictionaries. +// To optimize performance, raw pointer is directly used for reading buffer. The position has to be +// adjusted to access additional buffer. On the other hand, this class does not provide writable +// raw pointer but provides several methods that handle boundary checking for writing data. +class BufferWithExtendableBuffer { + public: + BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize) + : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), + mAdditionalBuffer(INITIAL_ADDITIONAL_BUFFER_SIZE), mUsedAdditionalBufferSize(0) {} + + AK_FORCE_INLINE int getTailPosition() const { + return mOriginalBufferSize + mUsedAdditionalBufferSize; + } + + /** + * For reading. + */ + AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { + return position >= mOriginalBufferSize; + } + + // CAVEAT!: Be careful about array out of bound access with buffers + AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { + if (usesAdditionalBuffer) { + return &mAdditionalBuffer[0]; + } else { + return mOriginalBuffer; + } + } + + AK_FORCE_INLINE int getOriginalBufferSize() const { + return mOriginalBufferSize; + } + + /** + * For writing. + * + * Writing is allowed for original buffer, already written region of additional buffer and the + * tail of additional buffer. + */ + AK_FORCE_INLINE bool writeUintAndAdvancePosition(const uint32_t data, const int size, + int *const pos) { + if (!(size >= 1 && size <= 4)) { + AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); + ASSERT(false); + return false; + } + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer; + if (usesAdditionalBuffer) { + *pos -= mOriginalBufferSize; + } + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBufferSize; + } + return true; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); + + static const size_t INITIAL_ADDITIONAL_BUFFER_SIZE; + static const size_t MAX_ADDITIONAL_BUFFER_SIZE; + static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + + uint8_t *const mOriginalBuffer; + const int mOriginalBufferSize; + std::vector<uint8_t> mAdditionalBuffer; + int mUsedAdditionalBufferSize; + + // Return if the buffer is successfully extended or not. + AK_FORCE_INLINE bool extendBuffer() { + if (mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP + > MAX_ADDITIONAL_BUFFER_SIZE) { + return false; + } + mAdditionalBuffer.resize(mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP); + return true; + } + + // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of + // the additional buffer, try extending the buffer. + AK_FORCE_INLINE bool checkAndPrepareWriting(const int pos, const int size) { + if (isInAdditionalBuffer(pos)) { + if (pos == mUsedAdditionalBufferSize) { + // Append data to the tail. + if (pos + size > static_cast<int>(mAdditionalBuffer.size())) { + // Need to extend buffer. + if (!extendBuffer()) { + return false; + } + } + mUsedAdditionalBufferSize += size; + } else if (pos + size >= mUsedAdditionalBufferSize) { + // The access will beyond the tail of used region. + return false; + } + } else { + if (pos < 0 || mOriginalBufferSize < pos + size) { + // Invalid position or violate the boundary. + return false; + } + } + return true; + } +}; +} +#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp index 68b1d5d15..a84cfb9d5 100644 --- a/native/jni/src/suggest/core/dictionary/byte_array_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "suggest/core/dictionary/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { diff --git a/native/jni/src/suggest/core/dictionary/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h index 75ccfc766..1d14929c7 100644 --- a/native/jni/src/suggest/core/dictionary/byte_array_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h @@ -29,7 +29,34 @@ namespace latinime { class ByteArrayUtils { public: /** - * Integer + * Integer writing + * + * Each method write a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, + const uint32_t data, const int size, int *const pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); + return; + case 2: + ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); + return; + case 3: + ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); + return; + case 4: + ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); + return; + default: + break; + } + } + + /** + * Integer reading * * Each method read a corresponding size integer in a big endian manner. */ @@ -187,6 +214,32 @@ class ByteArrayUtils { static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE; static const uint8_t CHARACTER_ARRAY_TERMINATOR; + + static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 24) & 0xFF; + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, + const uint16_t data, int *const pos) { + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, + const uint8_t data, int *const pos) { + buffer[(*pos)++] = data & 0xFF; + } }; } // namespace latinime #endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index 0e8d72f2e..3796c7b7b 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ -#include "suggest/core/dictionary/binary_dictionary_format_utils.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { @@ -22,22 +24,19 @@ namespace latinime { * Dictionary size */ // Any file smaller than this is not a dictionary. -const int BinaryDictionaryFormatUtils::DICTIONARY_MINIMUM_SIZE = 4; +const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 4; /** * Format versions */ - -// The versions of Latin IME that only handle format version 1 only test for the magic -// number, so we had to change it so that version 2 files would be rejected by older -// implementations. On this occasion, we made the magic number 32 bits long. -const uint32_t BinaryDictionaryFormatUtils::HEADER_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; +// 32 bit magic number is stored at the beginning of the dictionary header to reject unsupported +// or obsolete dictionary formats. +const uint32_t FormatUtils::HEADER_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; // Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12 -const int BinaryDictionaryFormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12; +const int FormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12; -/* static */ BinaryDictionaryFormatUtils::FORMAT_VERSION - BinaryDictionaryFormatUtils::detectFormatVersion(const uint8_t *const dict, - const int dictSize) { +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( + const uint8_t *const dict, const int dictSize) { // The magic number is stored big-endian. // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't // understand this format. diff --git a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index 830684c70..f84321577 100644 --- a/native/jni/src/suggest/core/dictionary/binary_dictionary_format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -14,24 +14,19 @@ * limitations under the License. */ -#ifndef LATINIME_BINARY_DICTIONARY_FORMAT_UTILS_H -#define LATINIME_BINARY_DICTIONARY_FORMAT_UTILS_H +#ifndef LATINIME_FORMAT_UTILS_H +#define LATINIME_FORMAT_UTILS_H #include <stdint.h> #include "defines.h" -#include "suggest/core/dictionary/byte_array_utils.h" namespace latinime { /** * Methods to handle binary dictionary format version. - * - * Currently, we have a file with a similar name, binary_format.h. binary_format.h contains binary - * reading methods and utility methods for various purposes. - * On the other hand, this file deals with only about dictionary format version. */ -class BinaryDictionaryFormatUtils { +class FormatUtils { public: enum FORMAT_VERSION { VERSION_2, @@ -42,11 +37,11 @@ class BinaryDictionaryFormatUtils { static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BinaryDictionaryFormatUtils); + DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); static const int DICTIONARY_MINIMUM_SIZE; static const uint32_t HEADER_VERSION_2_MAGIC_NUMBER; static const int HEADER_VERSION_2_MINIMUM_SIZE; }; } // namespace latinime -#endif /* LATINIME_BINARY_DICTIONARY_FORMAT_UTILS_H */ +#endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h new file mode 100644 index 000000000..6febd7832 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_MMAPPED_BUFFER_H +#define LATINIME_MMAPPED_BUFFER_H + +#include <cerrno> +#include <fcntl.h> +#include <stdint.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "defines.h" + +namespace latinime { + +class MmappedBuffer { + public: + static MmappedBuffer* openBuffer(const char *const path, const int pathLength, + const int bufferOffset, const int bufferSize, const bool isUpdatable) { + const int openMode = isUpdatable ? O_RDWR : O_RDONLY; + const int mmapFd = open(path, openMode); + if (mmapFd < 0) { + AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); + return 0; + } + const int pagesize = getpagesize(); + const int offset = bufferOffset % pagesize; + int alignedOffset = bufferOffset - offset; + int alignedSize = bufferSize + offset; + const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ; + void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, + alignedOffset); + if (mmappedBuffer == MAP_FAILED) { + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + close(mmapFd); + return 0; + } + uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset; + if (!buffer) { + AKLOGE("DICT: buffer is null"); + close(mmapFd); + return 0; + } + return new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, mmapFd, + isUpdatable); + } + + ~MmappedBuffer() { + int ret = munmap(mMmappedBuffer, mAlignedSize); + if (ret != 0) { + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + } + ret = close(mMmapFd); + if (ret != 0) { + AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); + } + } + + AK_FORCE_INLINE uint8_t *getBuffer() const { + return mBuffer; + } + + AK_FORCE_INLINE int getBufferSize() const { + return mBufferSize; + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + private: + AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize, + void *const mmappedBuffer, const int alignedSize, const int mmapFd, + const bool isUpdatable) + : mBuffer(buffer), mBufferSize(bufferSize), mMmappedBuffer(mmappedBuffer), + mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} + + DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); + + uint8_t *const mBuffer; + const int mBufferSize; + void *const mMmappedBuffer; + const int mAlignedSize; + const int mMmapFd; + const bool mIsUpdatable; +}; +} +#endif /* LATINIME_MMAPPED_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index 7cddb0882..b6aa85896 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -155,7 +155,8 @@ class TypingWeighting : public Weighting { float getNewWordBigramLanguageCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) const { - return DicNodeUtils::getBigramNodeImprobability(traverseSession->getBinaryDictionaryInfo(), + return DicNodeUtils::getBigramNodeImprobability( + traverseSession->getDictionaryStructurePolicy(), dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; } diff --git a/native/jni/src/utils/autocorrection_threshold_utils.cpp b/native/jni/src/utils/autocorrection_threshold_utils.cpp index 3406e0f8e..1f8ee0814 100644 --- a/native/jni/src/utils/autocorrection_threshold_utils.cpp +++ b/native/jni/src/utils/autocorrection_threshold_utils.cpp @@ -83,9 +83,12 @@ const int AutocorrectionThresholdUtils::FULL_WORD_MULTIPLIER = 2; return 0.0f; } + if (score <= 0 || distance >= afterLength) { + // normalizedScore must be 0.0f (the minimum value) if the score is less than or equal to 0, + // or if the edit distance is larger than or equal to afterLength. + return 0.0f; + } // add a weight based on edit distance. - // distance <= max(afterLength, beforeLength) == afterLength, - // so, 0 <= distance / afterLength <= 1 const float weight = 1.0f - static_cast<float>(distance) / static_cast<float>(afterLength); // TODO: Revise the following logic thoroughly by referring to... |