diff options
Diffstat (limited to 'native')
41 files changed, 1955 insertions, 622 deletions
diff --git a/native/jni/Android.mk b/native/jni/Android.mk index d83bdadfa..0594ddff0 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -70,9 +70,10 @@ LATIN_IME_CORE_SRC_FILES := \ bigram/bigram_list_read_write_utils.cpp \ bigram/dynamic_bigram_list_policy.cpp \ header/header_policy.cpp \ - header/header_reading_utils.cpp \ + header/header_read_write_utils.cpp \ shortcut/shortcut_list_reading_utils.cpp \ dictionary_structure_with_buffer_policy_factory.cpp \ + dynamic_patricia_trie_gc_event_listeners.cpp \ dynamic_patricia_trie_node_reader.cpp \ dynamic_patricia_trie_policy.cpp \ dynamic_patricia_trie_reading_helper.cpp \ @@ -84,6 +85,7 @@ LATIN_IME_CORE_SRC_FILES := \ $(addprefix suggest/policyimpl/dictionary/utils/, \ buffer_with_extendable_buffer.cpp \ byte_array_utils.cpp \ + dict_file_writing_utils.cpp \ format_utils.cpp) \ suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ $(addprefix suggest/policyimpl/typing/, \ diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 7f47493b2..7761ec4d5 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -26,12 +26,55 @@ #include "suggest/core/dictionary/dictionary.h" #include "suggest/core/suggest_options.h" #include "suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" #include "utils/autocorrection_threshold_utils.h" namespace latinime { class ProximityInfo; +// TODO: Move to makedict. +static jboolean latinime_BinaryDictionary_createEmptyDictFile(JNIEnv *env, jclass clazz, + jstring filePath, jlong dictVersion, jobjectArray attributeKeyStringArray, + jobjectArray attributeValueStringArray) { + const jsize filePathUtf8Length = env->GetStringUTFLength(filePath); + char filePathChars[filePathUtf8Length + 1]; + env->GetStringUTFRegion(filePath, 0, env->GetStringLength(filePath), filePathChars); + filePathChars[filePathUtf8Length] = '\0'; + + const int keyCount = env->GetArrayLength(attributeKeyStringArray); + const int valueCount = env->GetArrayLength(attributeValueStringArray); + if (keyCount != valueCount) { + return false; + } + + HeaderReadWriteUtils::AttributeMap attributeMap; + for (int i = 0; i < keyCount; i++) { + jstring keyString = static_cast<jstring>( + env->GetObjectArrayElement(attributeKeyStringArray, i)); + const jsize keyUtf8Length = env->GetStringUTFLength(keyString); + char keyChars[keyUtf8Length + 1]; + env->GetStringUTFRegion(keyString, 0, env->GetStringLength(keyString), keyChars); + keyChars[keyUtf8Length] = '\0'; + HeaderReadWriteUtils::AttributeMap::key_type key; + HeaderReadWriteUtils::insertCharactersIntoVector(keyChars, &key); + + jstring valueString = static_cast<jstring>( + env->GetObjectArrayElement(attributeValueStringArray, i)); + const jsize valueUtf8Length = env->GetStringUTFLength(valueString); + char valueChars[valueUtf8Length + 1]; + env->GetStringUTFRegion(valueString, 0, env->GetStringLength(valueString), valueChars); + valueChars[valueUtf8Length] = '\0'; + HeaderReadWriteUtils::AttributeMap::mapped_type value; + HeaderReadWriteUtils::insertCharactersIntoVector(valueChars, &value); + + attributeMap[key] = value; + } + + return DictFileWritingUtils::createEmptyDictFile(filePathChars, static_cast<int>(dictVersion), + &attributeMap); +} + static jlong latinime_BinaryDictionary_open(JNIEnv *env, jclass clazz, jstring sourceDir, jlong dictOffset, jlong dictSize, jboolean isUpdatable) { PROF_OPEN; @@ -282,6 +325,11 @@ static int latinime_BinaryDictionary_calculateProbabilityNative(JNIEnv *env, jcl static const JNINativeMethod sMethods[] = { { + const_cast<char *>("createEmptyDictFileNative"), + const_cast<char *>("(Ljava/lang/String;J[Ljava/lang/String;[Ljava/lang/String;)Z"), + reinterpret_cast<void *>(latinime_BinaryDictionary_createEmptyDictFile) + }, + { const_cast<char *>("openNative"), const_cast<char *>("(Ljava/lang/String;JJZ)J"), reinterpret_cast<void *>(latinime_BinaryDictionary_open) diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index 5ba71c168..71f4ef6ea 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -147,7 +147,7 @@ int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const in int pos = mDictionaryStructurePolicy->getTerminalNodePositionOfWord(prevWord, prevWordLength, forceLowerCaseSearch); if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS; - return mDictionaryStructurePolicy->getBigramsPositionOfNode(pos); + return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos); } int BigramDictionary::getBigramProbability(const int *word0, int length0, const int *word1, diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index aecf41386..4633c07b0 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -68,7 +68,7 @@ class MultiBigramMap { void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos) { - const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos); + const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), bigramsListPos); while (bigramsIt.hasNext()) { @@ -112,7 +112,7 @@ class MultiBigramMap { const DictionaryStructureWithBufferPolicy *const structurePolicy, const int nodePos, const int nextWordPosition, const int unigramProbability) { int bigramProbability = NOT_A_PROBABILITY; - const int bigramsListPos = structurePolicy->getBigramsPositionOfNode(nodePos); + const int bigramsListPos = structurePolicy->getBigramsPositionOfPtNode(nodePos); BinaryDictionaryBigramsIterator bigramsIt(structurePolicy->getBigramsStructurePolicy(), bigramsListPos); while (bigramsIt.hasNext()) { diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index 24d1b8ba1..b95488ebd 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -52,9 +52,9 @@ class DictionaryStructureWithBufferPolicy { virtual int getUnigramProbabilityOfPtNode(const int nodePos) const = 0; - virtual int getShortcutPositionOfNode(const int nodePos) const = 0; + virtual int getShortcutPositionOfPtNode(const int nodePos) const = 0; - virtual int getBigramsPositionOfNode(const int nodePos) const = 0; + virtual int getBigramsPositionOfPtNode(const int nodePos) const = 0; virtual const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const = 0; diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp index 0c925be25..b1340e12f 100644 --- a/native/jni/src/suggest/core/suggest.cpp +++ b/native/jni/src/suggest/core/suggest.cpp @@ -223,7 +223,7 @@ int Suggest::outputSuggestions(DicTraverseSession *traverseSession, int *frequen BinaryDictionaryShortcutIterator shortcutIt( traverseSession->getDictionaryStructurePolicy()->getShortcutsStructurePolicy(), traverseSession->getDictionaryStructurePolicy() - ->getShortcutPositionOfNode(terminalDicNode->getPos())); + ->getShortcutPositionOfPtNode(terminalDicNode->getPos())); // Shortcut is not supported for multiple words suggestions. // TODO: Check shortcuts during traversal for multiple words suggestions. const bool sameAsTyped = TRAVERSAL->sameAsTyped(traverseSession, terminalDicNode); diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h index 26015d512..6ff95cac4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h @@ -33,10 +33,9 @@ class BigramListPolicy : public DictionaryBigramsStructurePolicy { void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, int *const pos) const { - const BigramListReadWriteUtils::BigramFlags flags = - BigramListReadWriteUtils::getFlagsAndForwardPointer(mBigramsBuf, pos); - *outBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( - mBigramsBuf, flags, pos); + BigramListReadWriteUtils::BigramFlags flags; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBigramsBuf, &flags, + outBigramPos, pos); *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); *outHasNext = BigramListReadWriteUtils::hasNext(flags); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp index 09e832f07..1926b9831 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp @@ -16,7 +16,9 @@ #include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -38,23 +40,31 @@ const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; -/* static */ BigramListReadWriteUtils::BigramFlags - BigramListReadWriteUtils::getFlagsAndForwardPointer(const uint8_t *const bigramsBuf, - int *const pos) { - return ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos); +/* static */ void BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + const uint8_t *const bigramsBuf, BigramFlags *const outBigramFlags, + int *const outTargetPtNodePos, int *const bigramEntryPos) { + const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, + bigramEntryPos); + if (outBigramFlags) { + *outBigramFlags = bigramFlags; + } + const int targetPos = getBigramAddressAndAdvancePosition(bigramsBuf, bigramFlags, + bigramEntryPos); + if (outTargetPtNodePos) { + *outTargetPtNodePos = targetPos; + } } /* static */ void BigramListReadWriteUtils::skipExistingBigrams(const uint8_t *const bigramsBuf, - int *const pos) { - BigramFlags flags = getFlagsAndForwardPointer(bigramsBuf, pos); - while (hasNext(flags)) { - *pos += attributeAddressSize(flags); - flags = getFlagsAndForwardPointer(bigramsBuf, pos); - } - *pos += attributeAddressSize(flags); + int *const bigramListPos) { + BigramFlags flags; + do { + getBigramEntryPropertiesAndAdvancePosition(bigramsBuf, &flags, 0 /* outTargetPtNodePos */, + bigramListPos); + } while(hasNext(flags)); } -/* static */ int BigramListReadWriteUtils::getBigramAddressAndForwardPointer( +/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition( const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) { int offset = 0; const int origin = *pos; @@ -69,8 +79,10 @@ const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); break; } - if (offset == 0) { + if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID) { return NOT_A_DICT_POS; + } else if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET) { + return origin; } if (isOffsetNegative(flags)) { return origin - offset; @@ -79,4 +91,92 @@ const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; } } +/* static */ bool BigramListReadWriteUtils::setHasNextFlag( + BufferWithExtendableBuffer *const buffer, const bool hasNext, const int entryPos) { + const bool usesAdditionalBuffer = buffer->isInAdditionalBuffer(entryPos); + int readingPos = entryPos; + if (usesAdditionalBuffer) { + readingPos -= buffer->getOriginalBufferSize(); + } + BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition( + buffer->getBuffer(usesAdditionalBuffer), &readingPos); + if (hasNext) { + bigramFlags = bigramFlags | FLAG_ATTRIBUTE_HAS_NEXT; + } else { + bigramFlags = bigramFlags & (~FLAG_ATTRIBUTE_HAS_NEXT); + } + int writingPos = entryPos; + return buffer->writeUintAndAdvancePosition(bigramFlags, 1 /* size */, &writingPos); +} + +/* static */ bool BigramListReadWriteUtils::createAndWriteBigramEntry( + BufferWithExtendableBuffer *const buffer, const int targetPos, const int probability, + const bool hasNext, int *const writingPos) { + BigramFlags flags; + if (!createAndGetBigramFlags(*writingPos, targetPos, probability, hasNext, &flags)) { + return false; + } + return writeBigramEntry(buffer, flags, targetPos, writingPos); +} + +/* static */ bool BigramListReadWriteUtils::writeBigramEntry( + BufferWithExtendableBuffer *const bufferToWrite, const BigramFlags flags, + const int targetPtNodePos, int *const writingPos) { + const int offset = getBigramTargetOffset(targetPtNodePos, *writingPos); + const BigramFlags flagsToWrite = (offset < 0) ? + (flags | FLAG_ATTRIBUTE_OFFSET_NEGATIVE) : (flags & ~FLAG_ATTRIBUTE_OFFSET_NEGATIVE); + if (!bufferToWrite->writeUintAndAdvancePosition(flagsToWrite, 1 /* size */, writingPos)) { + return false; + } + const uint32_t absOffest = abs(offset); + const int bigramTargetFieldSize = attributeAddressSize(flags); + return bufferToWrite->writeUintAndAdvancePosition(absOffest, bigramTargetFieldSize, + writingPos); +} + +// Returns true if the bigram entry is valid and put entry flags into out*. +/* static */ bool BigramListReadWriteUtils::createAndGetBigramFlags(const int entryPos, + const int targetPtNodePos, const int probability, const bool hasNext, + BigramFlags *const outBigramFlags) { + BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; + if (hasNext) { + flags |= FLAG_ATTRIBUTE_HAS_NEXT; + } + const int offset = getBigramTargetOffset(targetPtNodePos, entryPos); + if (offset < 0) { + flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + } + const uint32_t absOffest = abs(offset); + if ((absOffest >> 24) != 0) { + // Offset is too large. + return false; + } else if ((absOffest >> 16) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + } else if ((absOffest >> 8) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + } else { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + } + // Currently, all newly written bigram position fields are 3 bytes to simplify dictionary + // writing. + // TODO: Remove following 2 lines and optimize memory space. + flags = (flags & (~MASK_ATTRIBUTE_ADDRESS_TYPE)) | FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + *outBigramFlags = flags; + return true; +} + +/* static */ int BigramListReadWriteUtils::getBigramTargetOffset(const int targetPtNodePos, + const int entryPos) { + if (targetPtNodePos == NOT_A_DICT_POS) { + return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; + } else { + const int offset = targetPtNodePos - (entryPos + 1 /* bigramFlagsField */); + if (offset == 0) { + return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } else { + return offset; + } + } +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h index 884bcd7a9..eabe4e099 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h @@ -24,11 +24,15 @@ namespace latinime { +class BufferWithExtendableBuffer; + class BigramListReadWriteUtils { public: typedef uint8_t BigramFlags; - static BigramFlags getFlagsAndForwardPointer(const uint8_t *const bigramsBuf, int *const pos); + static void getBigramEntryPropertiesAndAdvancePosition(const uint8_t *const bigramsBuf, + BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, + int *const bigramEntryPos); static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { return flags & MASK_ATTRIBUTE_PROBABILITY; @@ -39,10 +43,7 @@ public: } // Bigrams reading methods - static void skipExistingBigrams(const uint8_t *const bigramsBuf, int *const pos); - - static int getBigramAddressAndForwardPointer(const uint8_t *const bigramsBuf, - const BigramFlags flags, int *const pos); + static void skipExistingBigrams(const uint8_t *const bigramsBuf, int *const bigramListPos); // Returns the size of the bigram position field that is stored in bigram flags. static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { @@ -58,50 +59,19 @@ public: */ } - static AK_FORCE_INLINE BigramFlags setHasNextFlag(const BigramFlags flags) { - return flags | FLAG_ATTRIBUTE_HAS_NEXT; - } + static bool setHasNextFlag(BufferWithExtendableBuffer *const buffer, + const bool hasNext, const int entryPos); static AK_FORCE_INLINE BigramFlags setProbabilityInFlags(const BigramFlags flags, const int probability) { return (flags & (~MASK_ATTRIBUTE_PROBABILITY)) | (probability & MASK_ATTRIBUTE_PROBABILITY); } - // Returns true if the bigram entry is valid and put entry values into out*. - static AK_FORCE_INLINE bool createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( - const int entryPos, const int targetPos, const int probability, const bool hasNext, - BigramFlags *const outBigramFlags, uint32_t *const outOffset, - int *const outOffsetFieldSize) { - if (targetPos == NOT_A_DICT_POS) { - return false; - } - BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; - if (hasNext) { - flags |= FLAG_ATTRIBUTE_HAS_NEXT; - } - const int targetFieldPos = entryPos + 1; - const int offset = targetPos - targetFieldPos; - if (offset < 0) { - flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; - } - const uint32_t absOffest = abs(offset); - if ((absOffest >> 24) != 0) { - // Offset is too large. - return false; - } else if ((absOffest >> 16) != 0) { - flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; - *outOffsetFieldSize = 3; - } else if ((absOffest >> 8) != 0) { - flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; - *outOffsetFieldSize = 2; - } else { - flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; - *outOffsetFieldSize = 1; - } - *outBigramFlags = flags; - *outOffset = absOffest; - return true; - } + static bool createAndWriteBigramEntry(BufferWithExtendableBuffer *const buffer, + const int targetPos, const int probability, const bool hasNext, int *const writingPos); + + static bool writeBigramEntry(BufferWithExtendableBuffer *const buffer, const BigramFlags flags, + const int targetOffset, int *const writingPos); private: DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); @@ -115,9 +85,18 @@ private: static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; static const int ATTRIBUTE_ADDRESS_SHIFT; + // Returns true if the bigram entry is valid and put entry flags into out*. + static bool createAndGetBigramFlags(const int entryPos, const int targetPos, + const int probability, const bool hasNext, BigramFlags *const outBigramFlags); + static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; } + + static int getBigramAddressAndAdvancePosition(const uint8_t *const bigramsBuf, + const BigramFlags flags, int *const pos); + + static int getBigramTargetOffset(const int targetPtNodePos, const int entryPos); }; } // namespace latinime #endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp index 4c44d22fd..29307b56a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -16,58 +16,73 @@ #include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + namespace latinime { -const int DynamicBigramListPolicy::BIGRAM_LINK_COUNT_LIMIT = 10000; +const int DynamicBigramListPolicy::CONTINUING_BIGRAM_LINK_COUNT_LIMIT = 10000; +const int DynamicBigramListPolicy::BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT = 100000; void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, - bool *const outHasNext, int *const pos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + bool *const outHasNext, int *const bigramEntryPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramEntryPos); const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { - *pos -= mBuffer->getOriginalBufferSize(); + *bigramEntryPos -= mBuffer->getOriginalBufferSize(); } - const BigramListReadWriteUtils::BigramFlags flags = - BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos); - int originalBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( - buffer, flags, pos); + BigramListReadWriteUtils::BigramFlags bigramFlags; + int originalBigramPos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(buffer, &bigramFlags, + &originalBigramPos, bigramEntryPos); if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { originalBigramPos += mBuffer->getOriginalBufferSize(); } *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); - *outHasNext = BigramListReadWriteUtils::hasNext(flags); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); + *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); if (usesAdditionalBuffer) { - *pos += mBuffer->getOriginalBufferSize(); + *bigramEntryPos += mBuffer->getOriginalBufferSize(); } } -void DynamicBigramListPolicy::skipAllBigrams(int *const pos) const { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); +void DynamicBigramListPolicy::skipAllBigrams(int *const bigramListPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { - *pos -= mBuffer->getOriginalBufferSize(); + *bigramListPos -= mBuffer->getOriginalBufferSize(); } - BigramListReadWriteUtils::skipExistingBigrams(buffer, pos); + BigramListReadWriteUtils::skipExistingBigrams(buffer, bigramListPos); if (usesAdditionalBuffer) { - *pos += mBuffer->getOriginalBufferSize(); + *bigramListPos += mBuffer->getOriginalBufferSize(); } } -bool DynamicBigramListPolicy::copyAllBigrams(int *const fromPos, int *const toPos, - int *outBigramsCount) { +bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, + int *const fromPos, int *const toPos, int *const outBigramsCount) const { const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); if (usesAdditionalBuffer) { *fromPos -= mBuffer->getOriginalBufferSize(); } *outBigramsCount = 0; - BigramListReadWriteUtils::BigramFlags flags; + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + int lastWrittenEntryPos = NOT_A_DICT_POS; do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } // The buffer address can be changed after calling buffer writing methods. - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, fromPos); - int originalBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( - buffer, flags, fromPos); + int originalBigramPos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + fromPos); if (originalBigramPos == NOT_A_DICT_POS) { // skip invalid bigram entry. continue; @@ -76,132 +91,223 @@ bool DynamicBigramListPolicy::copyAllBigrams(int *const fromPos, int *const toPo originalBigramPos += mBuffer->getOriginalBufferSize(); } const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - BigramListReadWriteUtils::BigramFlags newBigramFlags; - uint32_t newBigramOffset; - int newBigramOffsetFieldSize; - if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( - *toPos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags), - BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset, - &newBigramOffsetFieldSize)) { + if (bigramPos == NOT_A_DICT_POS) { + // Target PtNode has been invalidated. continue; } - // Write bigram entry. Target buffer is always the additional buffer. - if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */,toPos)) { + lastWrittenEntryPos = *toPos; + if (!BigramListReadWriteUtils::createAndWriteBigramEntry(bufferToWrite, bigramPos, + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), + BigramListReadWriteUtils::hasNext(bigramFlags), toPos)) { return false; } - if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize, - toPos)) { + (*outBigramsCount)++; + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + // Makes the last entry the terminal of the list. Updates the flags. + if (lastWrittenEntryPos != NOT_A_DICT_POS) { + if (!BigramListReadWriteUtils::setHasNextFlag(bufferToWrite, false /* hasNext */, + lastWrittenEntryPos)) { return false; } - (*outBigramsCount)++; - } while(BigramListReadWriteUtils::hasNext(flags)); + } if (usesAdditionalBuffer) { *fromPos += mBuffer->getOriginalBufferSize(); } return true; } -bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramPos, - const int probability, int *const pos) { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); +// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode +// has been deleted or is not a valid terminal. +bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( + int *const bigramListPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = *bigramListPos; + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + bigramListPos); + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + if (originalBigramPos == NOT_A_DICT_POS) { + // This entry has already been removed. + continue; + } + if (usesAdditionalBuffer) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramTargetNodePos = + followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos); + // TODO: Update probability for supporting probability decaying. + if (nodeReader.isDeleted() || !nodeReader.isTerminal() + || bigramTargetNodePos == NOT_A_DICT_POS) { + // The target is no longer valid terminal. Invalidate the current bigram entry. + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) { + return false; + } + } + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return true; +} + +// Updates bigram target PtNode positions in the list after the placing step in GC. +bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos, + const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const + ptNodePositionRelocationMap) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); if (usesAdditionalBuffer) { - *pos -= mBuffer->getOriginalBufferSize(); + *bigramListPos -= mBuffer->getOriginalBufferSize(); } - BigramListReadWriteUtils::BigramFlags flags; + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; do { - int entryPos = *pos; + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = *bigramListPos; + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + int bigramTargetPtNodePos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &bigramTargetPtNodePos, + bigramListPos); + if (bigramTargetPtNodePos == NOT_A_DICT_POS) { + continue; + } + if (usesAdditionalBuffer) { + bigramTargetPtNodePos += mBuffer->getOriginalBufferSize(); + } + + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = + ptNodePositionRelocationMap->find(bigramTargetPtNodePos); + if (it != ptNodePositionRelocationMap->end()) { + bigramTargetPtNodePos = it->second; + } else { + bigramTargetPtNodePos = NOT_A_DICT_POS; + } + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + bigramTargetPtNodePos, &bigramEntryPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return true; +} + +bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos, + const int probability, int *const bigramListPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int entryPos = *bigramListPos; if (usesAdditionalBuffer) { entryPos += mBuffer->getOriginalBufferSize(); } + int originalBigramPos; // The buffer address can be changed after calling buffer writing methods. - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos); - int originalBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( - buffer, flags, pos); + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + bigramListPos); if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { originalBigramPos += mBuffer->getOriginalBufferSize(); } - if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramPos) { + if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { // Update this bigram entry. const BigramListReadWriteUtils::BigramFlags updatedFlags = - BigramListReadWriteUtils::setProbabilityInFlags(flags, probability); - return mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos); + BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability); + return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, + originalBigramPos, &entryPos); } - if (BigramListReadWriteUtils::hasNext(flags)) { + if (BigramListReadWriteUtils::hasNext(bigramFlags)) { continue; } // The current last entry is found. // First, update the flags of the last entry. - const BigramListReadWriteUtils::BigramFlags updatedFlags = - BigramListReadWriteUtils::setHasNextFlag(flags); - if (!mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos)) { + if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) { return false; } if (usesAdditionalBuffer) { - *pos += mBuffer->getOriginalBufferSize(); + *bigramListPos += mBuffer->getOriginalBufferSize(); } // Then, add a new entry after the last entry. - return writeNewBigramEntry(bigramPos, probability, pos); - } while(BigramListReadWriteUtils::hasNext(flags)); + return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos); + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); // We return directly from the while loop. ASSERT(false); return false; } -bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramPos, const int probability, +bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability, int *const writingPos) { - BigramListReadWriteUtils::BigramFlags newBigramFlags; - uint32_t newBigramOffset; - int newBigramOffsetFieldSize; - if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( - *writingPos, bigramPos, probability, false /* hasNext */, &newBigramFlags, - &newBigramOffset, &newBigramOffsetFieldSize)) { - return false; - } - // Write bigram flags. - if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */, writingPos)) { - return false; - } - // Write bigram positon offset. - if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize, - writingPos)) { - return false; - } - return true; + // hasNext is false because we are adding a new bigram entry at the end of the bigram list. + return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, + probability, false /* hasNext */, writingPos); } -bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int targetBigramPos) { +bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) { const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos); int pos = bigramListPos; if (usesAdditionalBuffer) { pos -= mBuffer->getOriginalBufferSize(); } - BigramListReadWriteUtils::BigramFlags flags; + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = pos; + int originalBigramPos; // The buffer address can be changed after calling buffer writing methods. - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); - flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, &pos); - int bigramOffsetFieldPos = pos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, &pos); if (usesAdditionalBuffer) { - bigramOffsetFieldPos += mBuffer->getOriginalBufferSize(); + bigramEntryPos += mBuffer->getOriginalBufferSize(); } - int originalBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( - buffer, flags, &pos); if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { originalBigramPos += mBuffer->getOriginalBufferSize(); } const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); - if (bigramPos != targetBigramPos) { + if (bigramPos != bigramTargetPos) { continue; } - // Target entry is found. Write 0 into the bigram pos field to mark the bigram invalid. - const int bigramOffsetFieldSize = BigramListReadWriteUtils::attributeAddressSize(flags); - if (!mBuffer->writeUintAndAdvancePosition(0 /* data */, bigramOffsetFieldSize, - &bigramOffsetFieldPos)) { - return false; - } - return true; - } while(BigramListReadWriteUtils::hasNext(flags)); + // Target entry is found. Write an invalid target position to mark the bigram invalid. + return BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos); + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); return false; } @@ -212,14 +318,14 @@ int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( } int currentPos = originalBigramPos; DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); - nodeReader.fetchNodeInfoFromBuffer(currentPos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); int bigramLinkCount = 0; while (nodeReader.getBigramLinkedNodePos() != NOT_A_DICT_POS) { currentPos = nodeReader.getBigramLinkedNodePos(); - nodeReader.fetchNodeInfoFromBuffer(currentPos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); bigramLinkCount++; - if (bigramLinkCount > BIGRAM_LINK_COUNT_LIMIT) { - AKLOGI("Bigram link is invalid. start position: %d", bigramPos); + if (bigramLinkCount > CONTINUING_BIGRAM_LINK_COUNT_LIMIT) { + AKLOGE("Bigram link is invalid. start position: %d", originalBigramPos); ASSERT(false); return NOT_A_DICT_POS; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index dafb62d80..8ea318a41 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -21,13 +21,13 @@ #include "defines.h" #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" -#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" -#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" namespace latinime { +class BufferWithExtendableBuffer; +class DictionaryShortcutsStructurePolicy; + /* * This is a dynamic version of BigramListPolicy and supports an additional buffer. */ @@ -40,27 +40,36 @@ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { ~DynamicBigramListPolicy() {} void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, - int *const pos) const; + int *const bigramEntryPos) const; + + void skipAllBigrams(int *const bigramListPos) const; + + // Copy bigrams from the bigram list that starts at fromPos in mBuffer to toPos in + // bufferToWrite and advance these positions after bigram lists. This method skips invalid + // bigram entries and write the valid bigram entry count to outBigramsCount. + bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos, + int *const toPos, int *const outBigramsCount) const; - void skipAllBigrams(int *const pos) const; + bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos); - // Copy bigrams from the bigram list that starts at fromPos to toPos and advance these - // positions after bigram lists. This method skips invalid bigram entries and write the valid - // bigram entry count to outBigramsCount. - bool copyAllBigrams(int *const fromPos, int *const toPos, int *outBigramsCount); + bool updateAllBigramTargetPtNodePositions(int *const bigramListPos, + const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const + ptNodePositionRelocationMap); - bool addNewBigramEntryToBigramList(const int bigramPos, const int probability, int *const pos); + bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability, + int *const bigramListPos); - bool writeNewBigramEntry(const int bigramPos, const int probability, + bool writeNewBigramEntry(const int bigramTargetPos, const int probability, int *const writingPos); // Return if targetBigramPos is found or not. - bool removeBigram(const int bigramListPos, const int targetBigramPos); + bool removeBigram(const int bigramListPos, const int bigramTargetPos); private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicBigramListPolicy); - static const int BIGRAM_LINK_COUNT_LIMIT; + static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; + static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; BufferWithExtendableBuffer *const mBuffer; const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp new file mode 100644 index 000000000..c60e45819 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" + +namespace latinime { + +bool DynamicPatriciaTrieGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless + // children. + bool isUselessPtNode = !node->isTerminal(); + if (mChildrenValue > 0) { + isUselessPtNode = false; + } else if (node->isTerminal()) { + // Remove children as all children are useless. + int writingPos = node->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( + mBuffer, NOT_A_DICT_POS /* childrenPosition */, &writingPos)) { + return false; + } + } + if (isUselessPtNode) { + // Current PtNode is no longer needed. Mark it as deleted. + if (!mWritingHelper->markNodeAsDeleted(node)) { + return false; + } + } else { + valueStack.back() += 1; + } + return true; +} + +// Writes dummy PtNode array size when the head of PtNode array is read. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onDescend(const int ptNodeArrayPos) { + mValidPtNodeCount = 0; + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::value_type( + ptNodeArrayPos, writingPos)); + // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. + // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. + mPtNodeArraySizeFieldPos = writingPos; + return DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, 0 /* arraySize */, &writingPos); +} + +// Write PtNode array terminal and actual PtNode array size. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onReadingPtNodeArrayTail() { + int writingPos = mBufferToWrite->getTailPosition(); + // Write PtNode array terminal. + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( + mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Write actual PtNode array size. + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { + return false; + } + return true; +} + +// Write valid PtNode to buffer and memorize mapping from the old position to the new position. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + if (node->isDeleted()) { + // Current PtNode is not written in new buffer because it has been deleted. + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( + node->getHeadPos(), NOT_A_DICT_POS)); + return true; + } + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( + node->getHeadPos(), writingPos)); + mValidPtNodeCount++; + // Writes current PtNode. + return mWritingHelper->writePtNodeToBufferByCopyingPtNodeInfo(mBufferToWrite, node, + node->getParentPos(), nodeCodePoints, node->getCodePointCount(), + node->getProbability(), &writingPos); +} + +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + // Updates parent position. + int parentPos = node->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = + mDictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != mDictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = node->getHeadPos() + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mBufferToWrite, + parentPos, node->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. + int childrenPos = node->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::const_iterator it = + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + writingPos = node->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBufferToWrite, + childrenPos, &writingPos)) { + return false; + } + + // Updates bigram target PtNode positions in the bigram list. + int bigramsPos = node->getBigramsPos(); + if (bigramsPos != NOT_A_DICT_POS) { + if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos, + &mDictPositionRelocationMap->mPtNodePositionRelocationMap)) { + return false; + } + } + + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h new file mode 100644 index 000000000..4256f22fb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H + +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class DynamicPatriciaTrieGcEventListeners { + public: + // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or + // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC. + // TODO: Concatenate non-terminal PtNodes. + class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + DynamicPatriciaTrieWritingHelper *const writingHelper, + BufferWithExtendableBuffer *const buffer) + : mWritingHelper(writingHelper), mBuffer(buffer), valueStack(), + mChildrenValue(0) {} + + ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; + + bool onAscend() { + if (valueStack.empty()) { + return false; + } + mChildrenValue = valueStack.back(); + valueStack.pop_back(); + return true; + } + + bool onDescend(const int ptNodeArrayPos) { + valueStack.push_back(0); + return true; + } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS( + TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); + + DynamicPatriciaTrieWritingHelper *const mWritingHelper; + BufferWithExtendableBuffer *const mBuffer; + std::vector<int> valueStack; + int mChildrenValue; + }; + + // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram + // entries. + class TraversePolicyToUpdateBigramProbability + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) + : mBigramPolicy(bigramPolicy) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + if (!node->isDeleted()) { + int pos = node->getBigramsPos(); + if (pos != NOT_A_DICT_POS) { + if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) { + return false; + } + } + } + return true; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); + + DynamicBigramListPolicy *const mBigramPolicy; + }; + + class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToPlaceAndWriteValidPtNodesToBuffer( + DynamicPatriciaTrieWritingHelper *const writingHelper, + BufferWithExtendableBuffer *const bufferToWrite, + DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + dictPositionRelocationMap) + : mWritingHelper(writingHelper), mBufferToWrite(bufferToWrite), + mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0), + mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {}; + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos); + + bool onReadingPtNodeArrayTail(); + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer); + + DynamicPatriciaTrieWritingHelper *const mWritingHelper; + BufferWithExtendableBuffer *const mBufferToWrite; + DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + mDictPositionRelocationMap; + int mValidPtNodeCount; + int mPtNodeArraySizeFieldPos; + }; + + class TraversePolicyToUpdateAllPositionFields + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPositionFields( + DynamicPatriciaTrieWritingHelper *const writingHelper, + DynamicBigramListPolicy *const bigramPolicy, + BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + dictPositionRelocationMap) + : mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy), + mBufferToWrite(bufferToWrite), + mDictPositionRelocationMap(dictPositionRelocationMap) {}; + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); + + DynamicPatriciaTrieWritingHelper *const mWritingHelper; + DynamicBigramListPolicy *const mBigramPolicy; + BufferWithExtendableBuffer *const mBufferToWrite; + const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + mDictPositionRelocationMap; + }; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieGcEventListeners); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp index 737098423..456352c17 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp @@ -23,26 +23,27 @@ namespace latinime { -void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(const int nodePos, - const int maxCodePointCount, int *const outCodePoints) { - if (nodePos < 0 || nodePos >= mBuffer->getTailPosition()) { +void DynamicPatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { AKLOGE("Fetching PtNode info form invalid dictionary position: %d, dictionary size: %d", - nodePos, mBuffer->getTailPosition()); + ptNodePos, mBuffer->getTailPosition()); ASSERT(false); invalidatePtNodeInfo(); return; } - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(nodePos); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); - int pos = nodePos; - mHeadPos = nodePos; + int pos = ptNodePos; + mHeadPos = ptNodePos; if (usesAdditionalBuffer) { pos -= mBuffer->getOriginalBufferSize(); } mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); - const int parentPos = - DynamicPatriciaTrieReadingUtils::getParentPosAndAdvancePosition(dictBuf, &pos); - mParentPos = (parentPos != 0) ? nodePos + parentPos : NOT_A_DICT_POS; + const int parentPosOffset = + DynamicPatriciaTrieReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(dictBuf, + &pos); + mParentPos = DynamicPatriciaTrieReadingUtils::getParentPtNodePos(parentPosOffset, mHeadPos); if (outCodePoints != 0) { mCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( dictBuf, mFlags, maxCodePointCount, outCodePoints, &pos); @@ -99,7 +100,8 @@ void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(c // Read destination node if the read node is a moved node. if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) { // The destination position is stored at the same place as the parent position. - fetchNodeInfoFromBufferAndProcessMovedNode(mParentPos, maxCodePointCount, outCodePoints); + fetchPtNodeInfoFromBufferAndProcessMovedPtNode(mParentPos, maxCodePointCount, + outCodePoints); } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h index 6ef5f5813..3b36d425f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h @@ -48,17 +48,17 @@ class DynamicPatriciaTrieNodeReader { ~DynamicPatriciaTrieNodeReader() {} - // Reads node information from dictionary buffer and updates members with the information. - AK_FORCE_INLINE void fetchNodeInfoFromBuffer(const int nodePos) { - fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos , 0 /* maxCodePointCount */, - 0 /* outCodePoints */); + // Reads PtNode information from dictionary buffer and updates members with the information. + AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) { + fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(ptNodePos , + 0 /* maxCodePointCount */, 0 /* outCodePoints */); } - AK_FORCE_INLINE void fetchNodeInfoFromBufferAndGetNodeCodePoints(const int nodePos, - const int maxCodePointCount, int *const outCodePoints) { + AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) { mSiblingPos = NOT_A_DICT_POS; mBigramLinkedNodePos = NOT_A_DICT_POS; - fetchNodeInfoFromBufferAndProcessMovedNode(nodePos, maxCodePointCount, outCodePoints); + fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, maxCodePointCount, outCodePoints); } // HeadPos is different from NodePos when the current PtNode is a moved PtNode. @@ -154,8 +154,8 @@ class DynamicPatriciaTrieNodeReader { int mBigramPos; int mSiblingPos; - void fetchNodeInfoFromBufferAndProcessMovedNode(const int nodePos, const int maxCodePointCount, - int *const outCodePoints); + void fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, + const int maxCodePointCount, int *const outCodePoints); void invalidatePtNodeInfo(); }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index 3cfbfd85b..42397c19e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -35,7 +35,7 @@ void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const d } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos()); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos()); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); while (!readingHelper.isEnd()) { childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(), @@ -48,7 +48,7 @@ void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const d } int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int nodePos, const int maxCodePointCount, int *const outCodePoints, + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { // This method traverses parent nodes from the terminal by following parent pointers; thus, // node code points are stored in the buffer in the reverse order. @@ -56,9 +56,9 @@ int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCoun DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); // First, read the terminal node and get its probability. - readingHelper.initWithNodePos(nodePos); + readingHelper.initWithPtNodePos(ptNodePos); if (!readingHelper.isValidTerminalNode()) { - // Node at the nodePos is not a valid terminal node. + // Node at the ptNodePos is not a valid terminal node. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } @@ -67,7 +67,7 @@ int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCoun // Then, following parent node link to the dictionary root and fetch node code points. while (!readingHelper.isEnd()) { if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { - // The nodePos is not a valid terminal node position in the dictionary. + // The ptNodePos is not a valid terminal node position in the dictionary. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } @@ -98,7 +98,7 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - readingHelper.initWithNodeArrayPos(getRootPosition()); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); while (!readingHelper.isEnd()) { const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); @@ -148,39 +148,39 @@ int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, } } -int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const { - if (nodePos == NOT_A_DICT_POS) { +int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_PROBABILITY; } DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - nodeReader.fetchNodeInfoFromBuffer(nodePos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { return NOT_A_PROBABILITY; } return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY); } -int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { - if (nodePos == NOT_A_DICT_POS) { +int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - nodeReader.fetchNodeInfoFromBuffer(nodePos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); if (nodeReader.isDeleted()) { return NOT_A_DICT_POS; } return nodeReader.getShortcutPos(); } -int DynamicPatriciaTriePolicy::getBigramsPositionOfNode(const int nodePos) const { - if (nodePos == NOT_A_DICT_POS) { +int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - nodeReader.fetchNodeInfoFromBuffer(nodePos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); if (nodeReader.isDeleted()) { return NOT_A_DICT_POS; } @@ -195,7 +195,7 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int } DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - readingHelper.initWithNodeArrayPos(getRootPosition()); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, &mBigramListPolicy, &mShortcutListPolicy); return writingHelper.addUnigramWord(&readingHelper, word, length, probability); @@ -248,7 +248,9 @@ void DynamicPatriciaTriePolicy::flush(const char *const filePath) { AKLOGI("Warning: flush() is called for non-updatable dictionary."); return; } - // TODO: Implement. + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + writingHelper.writeToDictFile(filePath, &mHeaderPolicy); } void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { @@ -256,7 +258,9 @@ void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); return; } - // TODO: Implement. + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); } bool DynamicPatriciaTriePolicy::needsToRunGC() const { @@ -264,8 +268,8 @@ bool DynamicPatriciaTriePolicy::needsToRunGC() const { AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); return false; } - // TODO: Implement. - return false; + // TODO: Implement more properly. + return mBufferWithExtendableBuffer.isNearSizeLimit(); } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index 2cbb0ff3b..06d8095d8 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -33,7 +33,7 @@ class DicNodeVector; class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer) - : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()), + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mShortcutListPolicy(&mBufferWithExtendableBuffer), @@ -51,7 +51,7 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { DicNodeVector *const childDicNodes) const; int getCodePointsAndProbabilityAndReturnCodePointCount( - const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, + const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const; int getTerminalNodePositionOfWord(const int *const inWord, @@ -59,11 +59,11 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getProbability(const int unigramProbability, const int bigramProbability) const; - int getUnigramProbabilityOfPtNode(const int nodePos) const; + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; - int getShortcutPositionOfNode(const int nodePos) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; - int getBigramsPositionOfNode(const int nodePos) const; + int getBigramsPositionOfPtNode(const int ptNodePos) const; const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { return &mHeaderPolicy; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp index a0b5be6a4..f4a2ef389 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -23,36 +23,167 @@ namespace latinime { // To avoid infinite loop caused by invalid or malicious forward links. const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const size_t DynamicPatriciaTrieReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; + +// Visits all PtNodes in post-order depth first manner. +// For example, visits c -> b -> y -> x -> a for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + while (!isEnd()) { + if (!alreadyVisitedChildren) { + if (mNodeReader.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(mNodeReader.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(); + } else { + alreadyVisitedChildren = true; + } + } else { + if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) { + return false; + } + readNextSiblingNode(); + if (isEnd()) { + // All PtNodes in current linked PtNode arrays have been visited. + // Return to the parent. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + if (!listener->onAscend()) { + return false; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + } else { + // Process sibling PtNode. + alreadyVisitedChildren = false; + } + } + } + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order +// that PtNodes are written in the dictionary buffer. +// For example, visits a -> b -> x -> c -> y for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedAllPtNodesInArray = false; + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + pushReadingStateToStack(); + while (!isEnd()) { + if (alreadyVisitedAllPtNodesInArray) { + if (alreadyVisitedChildren) { + // Move to next sibling PtNode's children. + readNextSiblingNode(); + if (isEnd()) { + // Return to the parent PTNode. + if (!listener->onAscend()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + alreadyVisitedAllPtNodesInArray = true; + } else { + alreadyVisitedChildren = false; + } + } else { + if (mNodeReader.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(mNodeReader.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(); + // Push state to return the head of PtNode array. + pushReadingStateToStack(); + alreadyVisitedAllPtNodesInArray = false; + alreadyVisitedChildren = false; + } else { + alreadyVisitedChildren = true; + } + } + } else { + if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) { + return false; + } + readNextSiblingNode(); + if (isEnd()) { + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + // Return to the head of current PtNode array. + popReadingStateFromStack(); + alreadyVisitedAllPtNodesInArray = true; + } + } + } + popReadingStateFromStack(); + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} // Read node array size and process empty node arrays. Nodes and arrays are counted up in this // method to avoid an infinite loop. -void DynamicPatriciaTrieReadingHelper::nextNodeArray() { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mPos); +void DynamicPatriciaTrieReadingHelper::nextPtNodeArray() { + mReadingState.mPosOfLastPtNodeArrayHead = mReadingState.mPos; + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { - mPos -= mBuffer->getOriginalBufferSize(); + mReadingState.mPos -= mBuffer->getOriginalBufferSize(); } - mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf, - &mPos); + mReadingState.mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + dictBuf, &mReadingState.mPos); if (usesAdditionalBuffer) { - mPos += mBuffer->getOriginalBufferSize(); + mReadingState.mPos += mBuffer->getOriginalBufferSize(); } // Count up nodes and node arrays to avoid infinite loop. - mTotalNodeCount += mNodeCount; - mNodeArrayCount++; - if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP - || mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { + mReadingState.mTotalNodeCount += mReadingState.mNodeCount; + mReadingState.mNodeArrayCount++; + if (mReadingState.mNodeCount < 0 + || mReadingState.mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP + || mReadingState.mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { // Invalid dictionary. AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", - mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, - mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); + mReadingState.mNodeCount, mReadingState.mTotalNodeCount, + MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mReadingState.mNodeArrayCount, + MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); ASSERT(false); mIsError = true; - mPos = NOT_A_DICT_POS; + mReadingState.mPos = NOT_A_DICT_POS; return; } - if (mNodeCount == 0) { + if (mReadingState.mNodeCount == 0) { // Empty node array. Try following forward link. followForwardLink(); } @@ -60,24 +191,24 @@ void DynamicPatriciaTrieReadingHelper::nextNodeArray() { // Follow the forward link and read the next node array if exists. void DynamicPatriciaTrieReadingHelper::followForwardLink() { - const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mPos); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { - mPos -= mBuffer->getOriginalBufferSize(); + mReadingState.mPos -= mBuffer->getOriginalBufferSize(); } const int forwardLinkPosition = - DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos); + DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mReadingState.mPos); if (usesAdditionalBuffer) { - mPos += mBuffer->getOriginalBufferSize(); + mReadingState.mPos += mBuffer->getOriginalBufferSize(); } - mPosOfLastForwardLinkField = mPos; + mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { // Follow the forward link. - mPos += forwardLinkPosition; - nextNodeArray(); + mReadingState.mPos += forwardLinkPosition; + nextPtNodeArray(); } else { // All node arrays have been read. - mPos = NOT_A_DICT_POS; + mReadingState.mPos = NOT_A_DICT_POS; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h index 120fd7699..c6d8ddcf7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -17,6 +17,9 @@ #ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H #define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H +#include <cstddef> +#include <vector> + #include "defines.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" @@ -34,12 +37,35 @@ class DictionaryShortcutsStructurePolicy; */ class DynamicPatriciaTrieReadingHelper { public: + class TraversingEventListener { + public: + virtual ~TraversingEventListener() {}; + + // Returns whether the event handling was succeeded or not. + virtual bool onAscend() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onDescend(const int ptNodeArrayPos) = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onReadingPtNodeArrayTail() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) = 0; + + protected: + TraversingEventListener() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); + }; + DynamicPatriciaTrieReadingHelper(const BufferWithExtendableBuffer *const buffer, const DictionaryBigramsStructurePolicy *const bigramsPolicy, const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) - : mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), - mTotalNodeCount(0), mNodeArrayCount(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), - mBuffer(buffer), mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy) {} + : mIsError(false), mReadingState(), mBuffer(buffer), + mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy), mReadingStateStack() {} ~DynamicPatriciaTrieReadingHelper() {} @@ -48,41 +74,43 @@ class DynamicPatriciaTrieReadingHelper { } AK_FORCE_INLINE bool isEnd() const { - return mPos == NOT_A_DICT_POS; + return mReadingState.mPos == NOT_A_DICT_POS; } - // Initialize reading state with the head position of a node array. - AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) { - if (nodeArrayPos == NOT_A_DICT_POS) { - mPos = NOT_A_DICT_POS; + // Initialize reading state with the head position of a PtNode array. + AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) { + if (ptNodeArrayPos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; } else { mIsError = false; - mPos = nodeArrayPos; - mNodeCount = 0; - mPrevTotalCodePointCount = 0; - mTotalNodeCount = 0; - mNodeArrayCount = 0; - mPosOfLastForwardLinkField = NOT_A_DICT_POS; - nextNodeArray(); + mReadingState.mPos = ptNodeArrayPos; + mReadingState.mPrevTotalCodePointCount = 0; + mReadingState.mTotalNodeCount = 0; + mReadingState.mNodeArrayCount = 0; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingStateStack.clear(); + nextPtNodeArray(); if (!isEnd()) { - fetchNodeInfo(); + fetchPtNodeInfo(); } } } // Initialize reading state with the head position of a node. - AK_FORCE_INLINE void initWithNodePos(const int nodePos) { - if (nodePos == NOT_A_DICT_POS) { - mPos = NOT_A_DICT_POS; + AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) { + if (ptNodePos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; } else { mIsError = false; - mPos = nodePos; - mNodeCount = 1; - mPrevTotalCodePointCount = 0; - mTotalNodeCount = 1; - mNodeArrayCount = 1; - mPosOfLastForwardLinkField = NOT_A_DICT_POS; - fetchNodeInfo(); + mReadingState.mPos = ptNodePos; + mReadingState.mNodeCount = 1; + mReadingState.mPrevTotalCodePointCount = 0; + mReadingState.mTotalNodeCount = 1; + mReadingState.mNodeArrayCount = 1; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS; + mReadingStateStack.clear(); + fetchPtNodeInfo(); } } @@ -100,12 +128,12 @@ class DynamicPatriciaTrieReadingHelper { // Return code point count exclude the last read node's code points. AK_FORCE_INLINE int getPrevTotalCodePointCount() const { - return mPrevTotalCodePointCount; + return mReadingState.mPrevTotalCodePointCount; } // Return code point count include the last read node's code points. AK_FORCE_INLINE int getTotalCodePointCount() const { - return mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); + return mReadingState.mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); } AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( @@ -121,85 +149,136 @@ class DynamicPatriciaTrieReadingHelper { } AK_FORCE_INLINE void readNextSiblingNode() { - mNodeCount -= 1; - mPos = mNodeReader.getSiblingNodePos(); - if (mNodeCount <= 0) { + mReadingState.mNodeCount -= 1; + mReadingState.mPos = mNodeReader.getSiblingNodePos(); + if (mReadingState.mNodeCount <= 0) { // All nodes in the current node array have been read. followForwardLink(); if (!isEnd()) { - fetchNodeInfo(); + fetchPtNodeInfo(); } } else { - fetchNodeInfo(); + fetchPtNodeInfo(); } } // Read the first child node of the current node. AK_FORCE_INLINE void readChildNode() { if (mNodeReader.hasChildren()) { - mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); - mTotalNodeCount = 0; - mNodeArrayCount = 0; - mPos = mNodeReader.getChildrenPos(); - mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mReadingState.mTotalNodeCount = 0; + mReadingState.mNodeArrayCount = 0; + mReadingState.mPos = mNodeReader.getChildrenPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; // Read children node array. - nextNodeArray(); + nextPtNodeArray(); if (!isEnd()) { - fetchNodeInfo(); + fetchPtNodeInfo(); } } else { - mPos = NOT_A_DICT_POS; + mReadingState.mPos = NOT_A_DICT_POS; } } // Read the parent node of the current node. AK_FORCE_INLINE void readParentNode() { if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { - mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); - mTotalNodeCount = 1; - mNodeArrayCount = 1; - mNodeCount = 1; - mPos = mNodeReader.getParentPos(); - mPosOfLastForwardLinkField = NOT_A_DICT_POS; - fetchNodeInfo(); + mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mReadingState.mTotalNodeCount = 1; + mReadingState.mNodeArrayCount = 1; + mReadingState.mNodeCount = 1; + mReadingState.mPos = mNodeReader.getParentPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS; + fetchPtNodeInfo(); } else { - mPos = NOT_A_DICT_POS; + mReadingState.mPos = NOT_A_DICT_POS; } } AK_FORCE_INLINE int getPosOfLastForwardLinkField() const { - return mPosOfLastForwardLinkField; + return mReadingState.mPosOfLastForwardLinkField; + } + + AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const { + return mReadingState.mPosOfLastPtNodeArrayHead; + } + + AK_FORCE_INLINE void reloadCurrentPtNodeInfo() { + if (!isEnd()) { + fetchPtNodeInfo(); + } } + bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener); + + bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener); + private: DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); + class ReadingState { + public: + // Note that copy constructor and assignment operator are used for this class to use + // std::vector. + ReadingState() : mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), + mTotalNodeCount(0), mNodeArrayCount(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), + mPosOfLastPtNodeArrayHead(NOT_A_DICT_POS) {} + + int mPos; + // Node count of a node array. + int mNodeCount; + int mPrevTotalCodePointCount; + int mTotalNodeCount; + int mNodeArrayCount; + int mPosOfLastForwardLinkField; + int mPosOfLastPtNodeArrayHead; + }; + static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; + static const size_t MAX_READING_STATE_STACK_SIZE; bool mIsError; - int mPos; - // Node count of a node array. - int mNodeCount; - int mPrevTotalCodePointCount; - int mTotalNodeCount; - int mNodeArrayCount; - int mPosOfLastForwardLinkField; + ReadingState mReadingState; const BufferWithExtendableBuffer *const mBuffer; DynamicPatriciaTrieNodeReader mNodeReader; int mMergedNodeCodePoints[MAX_WORD_LENGTH]; + std::vector<ReadingState> mReadingStateStack; - void nextNodeArray(); + void nextPtNodeArray(); void followForwardLink(); - AK_FORCE_INLINE void fetchNodeInfo() { - mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH, - mMergedNodeCodePoints); + AK_FORCE_INLINE void fetchPtNodeInfo() { + mNodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(mReadingState.mPos, + MAX_WORD_LENGTH, mMergedNodeCodePoints); if (mNodeReader.getCodePointCount() <= 0) { // Empty node is not allowed. mIsError = true; - mPos = NOT_A_DICT_POS; + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + AK_FORCE_INLINE void pushReadingStateToStack() { + if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) { + AKLOGI("Reading state stack overflow. Max size: %zd", MAX_READING_STATE_STACK_SIZE); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingStateStack.push_back(mReadingState); + } + } + + AK_FORCE_INLINE void popReadingStateFromStack() { + if (mReadingStateStack.empty()) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingState = mReadingStateStack.back(); + mReadingStateStack.pop_back(); + fetchPtNodeInfo(); } } }; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp index 8428c0b15..d68446db6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp @@ -28,24 +28,42 @@ const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_NOT_MOVED = 0xC0; const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_MOVED = 0x40; const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_DELETED = 0x80; +// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0. +// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum +// value of offsets, which is 0x7FFFFF is used to represent 0 offset. +const int DptReadingUtils::DICT_OFFSET_INVALID = 0; +const int DptReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; + /* static */ int DptReadingUtils::getForwardLinkPosition(const uint8_t *const buffer, const int pos) { int linkAddressPos = pos; return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &linkAddressPos); } -/* static */ int DptReadingUtils::getParentPosAndAdvancePosition(const uint8_t *const buffer, - int *const pos) { +/* static */ int DptReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); } +/* static */ int DptReadingUtils::getParentPtNodePos(const int parentOffset, const int ptNodePos) { + if (parentOffset == DICT_OFFSET_INVALID) { + return NOT_A_DICT_POS; + } else if (parentOffset == DICT_OFFSET_ZERO_OFFSET) { + return ptNodePos; + } else { + return parentOffset + ptNodePos; + } +} + /* static */ int DptReadingUtils::readChildrenPositionAndAdvancePosition( const uint8_t *const buffer, int *const pos) { const int base = *pos; const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); - if (offset == 0) { - // 0 offset means that the node does not have children. + if (offset == DICT_OFFSET_INVALID) { + // The PtNode does not have children. return NOT_A_DICT_POS; + } else if (offset == DICT_OFFSET_ZERO_OFFSET) { + return base; } else { return base + offset; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h index db5f9b1bd..67c3cc57e 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h @@ -27,13 +27,19 @@ class DynamicPatriciaTrieReadingUtils { public: typedef uint8_t NodeFlags; + static const int DICT_OFFSET_INVALID; + static const int DICT_OFFSET_ZERO_OFFSET; + static int getForwardLinkPosition(const uint8_t *const buffer, const int pos); static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) { return forwardLinkAddress != 0; } - static int getParentPosAndAdvancePosition(const uint8_t *const buffer, int *const pos); + static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + static int getParentPtNodePos(const int parentOffset, const int ptNodePos); static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos); diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp index 311d31e5d..578645cd5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -17,16 +17,22 @@ #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" #include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "utils/hash_map_compat.h" namespace latinime { const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; +// TODO: Make MAX_DICTIONARY_SIZE 8MB. +const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * 1024; bool DynamicPatriciaTrieWritingHelper::addUnigramWord( DynamicPatriciaTrieReadingHelper *const readingHelper, @@ -83,7 +89,7 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const const int probability) { int mMergedNodeCodePoints[MAX_WORD_LENGTH]; DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, + nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, mMergedNodeCodePoints); // Move node to add bigram entry. const int newNodePos = mBuffer->getTailPosition(); @@ -91,13 +97,13 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const return false; } int writingPos = newNodePos; - // Write a new PtNode using original PtNode's info to the tail of the dictionary. - if (!writePtNodeToBufferByCopyingPtNodeInfo(&nodeReader, nodeReader.getParentPos(), + // Write a new PtNode using original PtNode's info to the tail of the dictionary in mBuffer. + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, &nodeReader, nodeReader.getParentPos(), mMergedNodeCodePoints, nodeReader.getCodePointCount(), nodeReader.getProbability(), &writingPos)) { return false; } - nodeReader.fetchNodeInfoFromBuffer(newNodePos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(newNodePos); if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) { // Insert a new bigram entry into the existing bigram list. int bigramListPos = nodeReader.getBigramsPos(); @@ -124,13 +130,56 @@ bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const // Remove a bigram relation from word0Pos to word1Pos. bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); - nodeReader.fetchNodeInfoFromBuffer(word0Pos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(word0Pos); if (nodeReader.getBigramsPos() == NOT_A_DICT_POS) { return false; } return mBigramPolicy->removeBigram(nodeReader.getBigramsPos(), word1Pos); } +void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, + const HeaderPolicy *const headerPolicy) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) { + return; + } + DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); +} + +void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const fileName, const HeaderPolicy *const headerPolicy) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) { + return; + } + BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */, + MAX_DICTIONARY_SIZE); + if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) { + return; + } + DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); +} + +bool DynamicPatriciaTrieWritingHelper::markNodeAsDeleted( + const DynamicPatriciaTrieNodeReader *const nodeToUpdate) { + int pos = nodeToUpdate->getHeadPos(); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */); + int writingPos = nodeToUpdate->getHeadPos(); + // Update flags. + return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos); +} + bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( const DynamicPatriciaTrieNodeReader *const originalNode, const int movedPos, const int bigramLinkedNodePos) { @@ -153,9 +202,8 @@ bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( return false; } // Update moved position, which is stored in the parent offset field. - const int movedPosOffset = movedPos - originalNode->getHeadPos(); - if (!DynamicPatriciaTrieWritingUtils::writeParentOffsetAndAdvancePosition( - mBuffer, movedPosOffset, &writingPos)) { + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + mBuffer, movedPos, originalNode->getHeadPos(), &writingPos)) { return false; } // Update bigram linked node position, which is stored in the children position field. @@ -168,13 +216,12 @@ bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( // Update children's parent position. DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); - readingHelper.initWithNodeArrayPos(originalNode->getChildrenPos()); + readingHelper.initWithPtNodeArrayPos(originalNode->getChildrenPos()); while (!readingHelper.isEnd()) { - const int childPtNodeWrittenPos = nodeReader->getHeadPos(); - const int parentOffset = movedPos - childPtNodeWrittenPos; - int parentOffsetFieldPos = childPtNodeWrittenPos + 1 /* Flags */; - if (!DynamicPatriciaTrieWritingUtils::writeParentOffsetAndAdvancePosition( - mBuffer, parentOffset, &parentOffsetFieldPos)) { + int parentOffsetFieldPos = nodeReader->getHeadPos() + + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + mBuffer, movedPos, nodeReader->getHeadPos(), &parentOffsetFieldPos)) { // Parent offset cannot be written because of a bug or a broken dictionary; thus, // we give up to update dictionary. return false; @@ -186,7 +233,8 @@ bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( } // Write new PtNode at writingPos. -bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer(const bool isBlacklisted, +bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer( + BufferWithExtendableBuffer *const bufferToWrite, const bool isBlacklisted, const bool isNotAWord, const int parentPos, const int *const codePoints, const int codePointCount, const int probability, const int childrenPos, const int originalBigramListPos, const int originalShortcutListPos, @@ -194,38 +242,38 @@ bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer(const boo const int nodePos = *writingPos; // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the // PtNode writing. - if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, 0 /* nodeFlags */, - writingPos)) { + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, + 0 /* nodeFlags */, writingPos)) { return false; } // Calculate a parent offset and write the offset. - const int parentOffset = (parentPos != NOT_A_DICT_POS) ? parentPos - nodePos : NOT_A_DICT_POS; - if (!DynamicPatriciaTrieWritingUtils::writeParentOffsetAndAdvancePosition(mBuffer, - parentOffset, writingPos)) { + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(bufferToWrite, + parentPos, nodePos, writingPos)) { return false; } // Write code points - if (!DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition(mBuffer, codePoints, - codePointCount, writingPos)) { + if (!DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition(bufferToWrite, + codePoints, codePointCount, writingPos)) { return false; } // Write probability when the probability is a valid probability, which means this node is // terminal. if (probability != NOT_A_PROBABILITY) { - if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(bufferToWrite, probability, writingPos)) { return false; } } // Write children position - if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(bufferToWrite, childrenPos, writingPos)) { return false; } // Copy shortcut list when the originalShortcutListPos is valid dictionary position. if (originalShortcutListPos != NOT_A_DICT_POS) { int fromPos = originalShortcutListPos; - if (!mShortcutPolicy->copyAllShortcutsAndReturnIfSucceededOrNot(&fromPos, writingPos)) { + if (!mShortcutPolicy->copyAllShortcutsAndReturnIfSucceededOrNot(bufferToWrite, &fromPos, + writingPos)) { return false; } } @@ -233,7 +281,7 @@ bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer(const boo int bigramCount = 0; if (originalBigramListPos != NOT_A_DICT_POS) { int fromPos = originalBigramListPos; - if (!mBigramPolicy->copyAllBigrams(&fromPos, writingPos, &bigramCount)) { + if (!mBigramPolicy->copyAllBigrams(bufferToWrite, &fromPos, writingPos, &bigramCount)) { return false; } } @@ -245,27 +293,29 @@ bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer(const boo bigramCount > 0 /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, CHILDREN_POSITION_FIELD_SIZE); int flagsFieldPos = nodePos; - if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, nodeFlags, + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, nodeFlags, &flagsFieldPos)) { return false; } return true; } -bool DynamicPatriciaTrieWritingHelper::writePtNodeToBuffer(const int parentPos, +bool DynamicPatriciaTrieWritingHelper::writePtNodeToBuffer( + BufferWithExtendableBuffer *const bufferToWrite, const int parentPos, const int *const codePoints, const int codePointCount, const int probability, int *const writingPos) { - return writePtNodeWithFullInfoToBuffer(false /* isBlacklisted */, false /* isNotAWord */, - parentPos, codePoints, codePointCount, probability, + return writePtNodeWithFullInfoToBuffer(bufferToWrite, false /* isBlacklisted */, + false /* isNotAWord */, parentPos, codePoints, codePointCount, probability, NOT_A_DICT_POS /* childrenPos */, NOT_A_DICT_POS /* originalBigramsPos */, NOT_A_DICT_POS /* originalShortcutPos */, writingPos); } bool DynamicPatriciaTrieWritingHelper::writePtNodeToBufferByCopyingPtNodeInfo( + BufferWithExtendableBuffer *const bufferToWrite, const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, const int *const codePoints, const int codePointCount, const int probability, int *const writingPos) { - return writePtNodeWithFullInfoToBuffer(originalNode->isBlacklisted(), + return writePtNodeWithFullInfoToBuffer(bufferToWrite, originalNode->isBlacklisted(), originalNode->isNotAWord(), parentPos, codePoints, codePointCount, probability, originalNode->getChildrenPos(), originalNode->getBigramsPos(), originalNode->getShortcutPos(), writingPos); @@ -299,8 +349,9 @@ bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) { return false; } - if (!writePtNodeToBufferByCopyingPtNodeInfo(originalPtNode, originalPtNode->getParentPos(), - codePoints, originalPtNode->getCodePointCount(), probability, &movedPos)) { + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode, + originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(), + probability, &movedPos)) { return false; } } @@ -328,8 +379,8 @@ bool DynamicPatriciaTrieWritingHelper::createNewPtNodeArrayWithAChildPtNode( 1 /* arraySize */, &writingPos)) { return false; } - if (!writePtNodeToBuffer(parentPtNodePos, nodeCodePoints, nodeCodePointCount, probability, - &writingPos)) { + if (!writePtNodeToBuffer(mBuffer, parentPtNodePos, nodeCodePoints, nodeCodePointCount, + probability, &writingPos)) { return false; } if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, @@ -358,8 +409,9 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( // Write the 1st part of the reallocating node. The children position will be updated later // with actual children position. const int newProbability = addsExtraChild ? NOT_A_PROBABILITY : probabilityOfNewPtNode; - if (!writePtNodeToBuffer(reallocatingPtNode->getParentPos(), reallocatingPtNodeCodePoints, - overlappingCodePointCount, newProbability, &writingPos)) { + if (!writePtNodeToBuffer(mBuffer, reallocatingPtNode->getParentPos(), + reallocatingPtNodeCodePoints, overlappingCodePointCount, newProbability, + &writingPos)) { return false; } const int actualChildrenPos = writingPos; @@ -371,14 +423,15 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( } // Write the 2nd part of the reallocating node. const int secondPartOfReallocatedPtNodePos = writingPos; - if (!writePtNodeToBufferByCopyingPtNodeInfo(reallocatingPtNode, firstPartOfReallocatedPtNodePos, + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, reallocatingPtNode, + firstPartOfReallocatedPtNodePos, reallocatingPtNodeCodePoints + overlappingCodePointCount, reallocatingPtNode->getCodePointCount() - overlappingCodePointCount, reallocatingPtNode->getProbability(), &writingPos)) { return false; } if (addsExtraChild) { - if (!writePtNodeToBuffer(firstPartOfReallocatedPtNodePos, + if (!writePtNodeToBuffer(mBuffer, firstPartOfReallocatedPtNodePos, newNodeCodePoints + overlappingCodePointCount, newNodeCodePointCount - overlappingCodePointCount, probabilityOfNewPtNode, &writingPos)) { @@ -396,7 +449,7 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( } // Load node info. Information of the 1st part will be fetched. DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); - nodeReader.fetchNodeInfoFromBuffer(firstPartOfReallocatedPtNodePos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos); // Update children position. int childrenPosFieldPos = nodeReader.getChildrenPosFieldPos(); if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, @@ -406,4 +459,53 @@ bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( return true; } +bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + BufferWithExtendableBuffer *const bufferToWrite) { + DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + this, mBuffer); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability + traversePolicyToUpdateBigramProbability(mBigramPolicy); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateBigramProbability)) { + return false; + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(this, bufferToWrite, + &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instance for the GCed dictionary. + DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); + DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy); + // Create reading helper for the GCed dictionary. + DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, + &newDictShortcutPolicy); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(this, &newDictBigramPolicy, bufferToWrite, + &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + return true; +} + } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h index 20e35abcf..fe1b2437a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -17,7 +17,10 @@ #ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H #define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H +#include <stdint.h> + #include "defines.h" +#include "utils/hash_map_compat.h" namespace latinime { @@ -26,9 +29,24 @@ class DynamicBigramListPolicy; class DynamicPatriciaTrieNodeReader; class DynamicPatriciaTrieReadingHelper; class DynamicShortcutListPolicy; +class HeaderPolicy; class DynamicPatriciaTrieWritingHelper { public: + typedef hash_map_compat<int, int> PtNodeArrayPositionRelocationMap; + typedef hash_map_compat<int, int> PtNodePositionRelocationMap; + struct DictPositionRelocationMap { + public: + DictPositionRelocationMap() + : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} + + PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; + PtNodePositionRelocationMap mPtNodePositionRelocationMap; + + private: + DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); + }; + DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, DynamicBigramListPolicy *const bigramPolicy, DynamicShortcutListPolicy *const shortcutPolicy) @@ -46,10 +64,27 @@ class DynamicPatriciaTrieWritingHelper { // Remove a bigram relation from word0Pos to word1Pos. bool removeBigramWords(const int word0Pos, const int word1Pos); + void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy); + + void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, + const HeaderPolicy *const headerPolicy); + + // CAVEAT: This method must be called only from inner classes of + // DynamicPatriciaTrieGcEventListeners. + bool markNodeAsDeleted(const DynamicPatriciaTrieNodeReader *const nodeToUpdate); + + // CAVEAT: This method must be called only from this class or inner classes of + // DynamicPatriciaTrieGcEventListeners. + bool writePtNodeToBufferByCopyingPtNodeInfo(BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); static const int CHILDREN_POSITION_FIELD_SIZE; + static const size_t MAX_DICTIONARY_SIZE; BufferWithExtendableBuffer *const mBuffer; DynamicBigramListPolicy *const mBigramPolicy; @@ -58,18 +93,15 @@ class DynamicPatriciaTrieWritingHelper { bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate, const int movedPos, const int bigramLinkedNodePos); - bool writePtNodeWithFullInfoToBuffer(const bool isBlacklisted, const bool isNotAWord, + bool writePtNodeWithFullInfoToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool isBlacklisted, const bool isNotAWord, const int parentPos, const int *const codePoints, const int codePointCount, const int probability, const int childrenPos, const int originalBigramListPos, const int originalShortcutListPos, int *const writingPos); - bool writePtNodeToBuffer(const int parentPos, const int *const codePoints, - const int codePointCount, const int probability, int *const writingPos); - - bool writePtNodeToBufferByCopyingPtNodeInfo( - const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, - const int *const codePoints, const int codePointCount, const int probability, - int *const writingPos); + bool writePtNodeToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const int parentPos, const int *const codePoints, const int codePointCount, + const int probability, int *const writingPos); bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); @@ -89,6 +121,8 @@ class DynamicPatriciaTrieWritingHelper { const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, const int probabilityOfNewPtNode, const int *const newNodeCodePoints, const int newNodeCodePointCount); + + bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite); }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp index b261e594d..30ff10cd6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp @@ -36,21 +36,33 @@ const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; const int DynamicPatriciaTrieWritingUtils::PROBABILITY_FIELD_SIZE = 1; const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; +/* static */ bool DynamicPatriciaTrieWritingUtils::writeEmptyDictionary( + BufferWithExtendableBuffer *const buffer, const int rootPos) { + int writingPos = rootPos; + if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) { + return false; + } + return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */, + &writingPos); +} + /* static */ bool DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, int *const forwardLinkFieldPos) { - const int offset = (forwardLinkPos != NOT_A_DICT_POS) ? - forwardLinkPos - (*forwardLinkFieldPos) : 0; - return writeDictOffset(buffer, offset, forwardLinkFieldPos); + return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos); } /* static */ bool DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const size_t arraySize, int *const arraySizeFieldPos) { - if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) { + // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to + // simplify updating process. + // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays. + /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) { return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE, arraySizeFieldPos); - } else if (arraySize <= MAX_PTNODE_ARRAY_SIZE) { + } else */ + if (arraySize <= MAX_PTNODE_ARRAY_SIZE) { uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE, arraySizeFieldPos); @@ -69,11 +81,10 @@ const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; } // Note that parentOffset is offset from node's head position. -/* static */ bool DynamicPatriciaTrieWritingUtils::writeParentOffsetAndAdvancePosition( - BufferWithExtendableBuffer *const buffer, const int parentOffset, +/* static */ bool DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos, int *const parentPosFieldPos) { - int offset = (parentOffset != NOT_A_DICT_POS) ? parentOffset : 0; - return writeDictOffset(buffer, offset, parentPosFieldPos); + return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos); } /* static */ bool DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition( @@ -106,13 +117,19 @@ const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; /* static */ bool DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int childrenPosition, int *const childrenPositionFieldPos) { - int offset = (childrenPosition != NOT_A_DICT_POS) ? - childrenPosition - (*childrenPositionFieldPos) : 0; - return writeDictOffset(buffer, offset, childrenPositionFieldPos); + return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos), + childrenPositionFieldPos); } /* static */ bool DynamicPatriciaTrieWritingUtils::writeDictOffset( - BufferWithExtendableBuffer *const buffer, const int offset, int *const offsetFieldPos) { + BufferWithExtendableBuffer *const buffer, const int targetPos, const int basePos, + int *const offsetFieldPos) { + int offset = targetPos - basePos; + if (targetPos == NOT_A_DICT_POS) { + offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; + } else if (offset == 0) { + offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) { AKLOGI("offset cannot be written because the offset is too large or too small: %d", offset); diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h index 183ede444..af76bc6b5 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h @@ -28,6 +28,10 @@ class BufferWithExtendableBuffer; class DynamicPatriciaTrieWritingUtils { public: + static const int NODE_FLAG_FIELD_SIZE; + + static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos); + static bool writeForwardLinkPositionAndAdvancePosition( BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, int *const forwardLinkFieldPos); @@ -39,8 +43,8 @@ class DynamicPatriciaTrieWritingUtils { const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos); - static bool writeParentOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer, - const int parentPosition, int *const parentPosFieldPos); + static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int parentPosition, const int basePos, int *const parentPosFieldPos); static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, const int *const codePoints, const int codePointCount, int *const codePointFieldPos); @@ -63,11 +67,10 @@ class DynamicPatriciaTrieWritingUtils { static const int MAX_DICT_OFFSET_VALUE; static const int MIN_DICT_OFFSET_VALUE; static const int DICT_OFFSET_NEGATIVE_FLAG; - static const int NODE_FLAG_FIELD_SIZE; static const int PROBABILITY_FIELD_SIZE; - static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int offset, - int *const offsetFieldPos); + static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos, + const int basePos, int *const offsetFieldPos); }; } // namespace latinime #endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 196da5c97..7bbeacaa0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -17,13 +17,17 @@ #include "suggest/policyimpl/dictionary/header/header_policy.h" #include <cstddef> +#include <cstdio> +#include <ctime> namespace latinime { + +// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE"; const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; -const float HeaderPolicy::DEFAULT_MULTIPLE_WORD_COST_MULTIPLIER = 1.0f; +const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; // Used for logging. Question mark is used to indicate that the key is not found. @@ -35,8 +39,8 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out return; } std::vector<int> keyCodePointVector; - insertCharactersIntoVector(key, &keyCodePointVector); - HeaderReadingUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector); + HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); + HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector); if (it == mAttributeMap.end()) { // The key was not found. outValue[0] = '?'; @@ -51,80 +55,77 @@ void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *out } float HeaderPolicy::readMultipleWordCostMultiplier() const { - int attributeValue = 0; - if (getAttributeValueAsInt(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &attributeValue)) { - if (attributeValue <= 0) { - return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); - } - return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(attributeValue); - } else { - return DEFAULT_MULTIPLE_WORD_COST_MULTIPLIER; + std::vector<int> keyVector; + HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector); + const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + &keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); + if (demotionRate <= 0) { + return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); } + return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate); } bool HeaderPolicy::readUsesForgettingCurveFlag() const { - int attributeValue = 0; - if (getAttributeValueAsInt(USES_FORGETTING_CURVE_KEY, &attributeValue)) { - return attributeValue != 0; - } else { - return false; - } + std::vector<int> keyVector; + HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector); + return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector, + false /* defaultValue */); } -// Returns S_INT_MIN when the key is not found or the value is invalid. +// Returns current time when the key is not found or the value is invalid. int HeaderPolicy::readLastUpdatedTime() const { - int attributeValue = 0; - if (getAttributeValueAsInt(LAST_UPDATED_TIME_KEY, &attributeValue)) { - return attributeValue; - } else { - return S_INT_MIN; - } + std::vector<int> keyVector; + HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector); + return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector, + time(0) /* defaultValue */); } -// Returns whether the key is found or not and stores the found value into outValue. -bool HeaderPolicy::getAttributeValueAsInt(const char *const key, int *const outValue) const { - std::vector<int> keyVector; - insertCharactersIntoVector(key, &keyVector); - HeaderReadingUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyVector); - if (it == mAttributeMap.end()) { - // The key was not found. +bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool updatesLastUpdatedTime) const { + int writingPos = 0; + if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion, + &writingPos)) { return false; } - *outValue = parseIntAttributeValue(&(it->second)); - return true; -} - -/* static */ HeaderReadingUtils::AttributeMap HeaderPolicy::createAttributeMapAndReadAllAttributes( - const uint8_t *const dictBuf) { - HeaderReadingUtils::AttributeMap attributeMap; - HeaderReadingUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); - return attributeMap; -} - -/* static */ int HeaderPolicy::parseIntAttributeValue( - const std::vector<int> *const attributeValue) { - int value = 0; - bool isNegative = false; - for (size_t i = 0; i < attributeValue->size(); ++i) { - if (i == 0 && attributeValue->at(i) == '-') { - isNegative = true; - } else { - if (!isdigit(attributeValue->at(i))) { - // If not a number, return S_INT_MIN - return S_INT_MIN; - } - value *= 10; - value += attributeValue->at(i) - '0'; + if (!HeaderReadWriteUtils::writeDictionaryFlags(bufferToWrite, mDictionaryFlags, + &writingPos)) { + return false; + } + // Temporarily writes a dummy header size. + int headerSizeFieldPos = writingPos; + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, 0 /* size */, + &writingPos)) { + return false; + } + if (updatesLastUpdatedTime) { + // Set current time as a last updated time. + HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap); + std::vector<int> updatedTimekey; + HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey); + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0)); + if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, + &writingPos)) { + return false; } + } else { + if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap, + &writingPos)) { + return false; + } + } + // Writes an actual header size. + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos, + &headerSizeFieldPos)) { + return false; } - return isNegative ? -value : value; + return true; } -/* static */ void HeaderPolicy::insertCharactersIntoVector(const char *const characters, - std::vector<int> *const vector) { - for (int i = 0; characters[i]; ++i) { - vector->push_back(characters[i]); - } +/* static */ HeaderReadWriteUtils::AttributeMap + HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { + HeaderReadWriteUtils::AttributeMap attributeMap; + HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); + return attributeMap; } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index 930b475c7..e97c08ca4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -17,25 +17,37 @@ #ifndef LATINIME_HEADER_POLICY_H #define LATINIME_HEADER_POLICY_H -#include <cctype> #include <stdint.h> #include "defines.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" -#include "suggest/policyimpl/dictionary/header/header_reading_utils.h" +#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" namespace latinime { class HeaderPolicy : public DictionaryHeaderStructurePolicy { public: - explicit HeaderPolicy(const uint8_t *const dictBuf) - : mDictBuf(dictBuf), mDictionaryFlags(HeaderReadingUtils::getFlags(dictBuf)), - mSize(HeaderReadingUtils::getHeaderSize(dictBuf)), - mAttributeMap(createAttributeMapAndReadAllAttributes(mDictBuf)), + // Reads information from existing dictionary buffer. + HeaderPolicy(const uint8_t *const dictBuf, const int dictSize) + : mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)), + mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), + mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), + mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), mUsesForgettingCurve(readUsesForgettingCurveFlag()), mLastUpdatedTime(readLastUpdatedTime()) {} + // Constructs header information using an attribute map. + HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, + const HeaderReadWriteUtils::AttributeMap *const attributeMap) + : mDictFormatVersion(dictFormatVersion), + mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + attributeMap)), mSize(0), mAttributeMap(*attributeMap), + mMultiWordCostMultiplier(readUsesForgettingCurveFlag()), + mUsesForgettingCurve(readUsesForgettingCurveFlag()), + mLastUpdatedTime(readLastUpdatedTime()) {} + ~HeaderPolicy() {} AK_FORCE_INLINE int getSize() const { @@ -43,16 +55,15 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { } AK_FORCE_INLINE bool supportsDynamicUpdate() const { - return HeaderReadingUtils::supportsDynamicUpdate(mDictionaryFlags); + return HeaderReadWriteUtils::supportsDynamicUpdate(mDictionaryFlags); } AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { - return HeaderReadingUtils::requiresGermanUmlautProcessing(mDictionaryFlags); + return HeaderReadWriteUtils::requiresGermanUmlautProcessing(mDictionaryFlags); } AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const { - return HeaderReadingUtils::requiresFrenchLigatureProcessing( - mDictionaryFlags); + return HeaderReadWriteUtils::requiresFrenchLigatureProcessing(mDictionaryFlags); } AK_FORCE_INLINE float getMultiWordCostMultiplier() const { @@ -70,19 +81,22 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; + bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool updatesLastUpdatedTime) const; + private: DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; static const char *const USES_FORGETTING_CURVE_KEY; static const char *const LAST_UPDATED_TIME_KEY; - static const float DEFAULT_MULTIPLE_WORD_COST_MULTIPLIER; + static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; - const uint8_t *const mDictBuf; - const HeaderReadingUtils::DictionaryFlags mDictionaryFlags; + const FormatUtils::FORMAT_VERSION mDictFormatVersion; + const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; const int mSize; - HeaderReadingUtils::AttributeMap mAttributeMap; + HeaderReadWriteUtils::AttributeMap mAttributeMap; const float mMultiWordCostMultiplier; const bool mUsesForgettingCurve; const int mLastUpdatedTime; @@ -93,15 +107,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { int readLastUpdatedTime() const; - bool getAttributeValueAsInt(const char *const key, int *const outValue) const; - - static HeaderReadingUtils::AttributeMap createAttributeMapAndReadAllAttributes( + static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes( const uint8_t *const dictBuf); - - static int parseIntAttributeValue(const std::vector<int> *const attributeValue); - - static void insertCharactersIntoVector( - const char *const characters, std::vector<int> *const vector); }; } // namespace latinime #endif /* LATINIME_HEADER_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp new file mode 100644 index 000000000..3b1c78085 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" + +#include <cctype> +#include <cstdio> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; + +const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; +const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; +const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; +const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; + +const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; +// Flags for special processing +// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or +// something very bad (like, the apocalypse) will happen. Please update both at the same time. +const HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; +const HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; +const HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; + +// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. +const char *const HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_KEY = "SUPPORTS_DYNAMIC_UPDATE"; +const char *const HeaderReadWriteUtils::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = + "REQUIRES_GERMAN_UMLAUT_PROCESSING"; +const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY = + "REQUIRES_FRENCH_LIGATURE_PROCESSING"; + +/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) { + // See the format of the header in the comment in + // BinaryDictionaryFormatUtils::detectFormatVersion() + return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE + + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE); +} + +/* static */ HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) { + return ByteArrayUtils::readUint16(dictBuf, + HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE); +} + +/* static */ HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + AttributeMap::key_type key; + insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key); + const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key, + false /* defaultValue */); + key.clear(); + insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key); + const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key, + false /* defaultValue */); + key.clear(); + insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key); + const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key, + false /* defaultValue */); + DictionaryFlags dictflags = NO_FLAGS; + dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0; + dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0; + dictflags |= supportsDynamicUpdate ? SUPPORTS_DYNAMIC_UPDATE_FLAG : 0; + return dictflags; +} + +/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf, + AttributeMap *const headerAttributes) { + const int headerSize = getHeaderSize(dictBuf); + int pos = getHeaderOptionsPosition(); + if (pos == NOT_A_DICT_POS) { + // The header doesn't have header options. + return; + } + int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; + int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH]; + while (pos < headerSize) { + const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, + MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos); + std::vector<int> key; + key.insert(key.end(), keyBuffer, keyBuffer + keyLength); + const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, + MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos); + std::vector<int> value; + value.insert(value.end(), valueBuffer, valueBuffer + valueLength); + headerAttributes->insert(AttributeMap::value_type(key, value)); + } +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( + BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, + int *const writingPos) { + if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE, + writingPos)) { + return false; + } + switch (version) { + case FormatUtils::VERSION_2: + // Version 2 dictionary writing is not supported. + return false; + case FormatUtils::VERSION_3: + return buffer->writeUintAndAdvancePosition(3 /* data */, + HEADER_DICTIONARY_VERSION_SIZE, writingPos); + default: + return false; + } +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags( + BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags, + int *const writingPos) { + return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos); +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize( + BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) { + return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos); +} + +/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes( + BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes, + int *const writingPos) { + for (AttributeMap::const_iterator it = headerAttributes->begin(); + it != headerAttributes->end(); ++it) { + // Write a key. + if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(), + true /* writesTerminator */, writingPos)) { + return false; + } + // Write a value. + if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(), + true /* writesTerminator */, writingPos)) { + return false; + } + } + return true; +} + +/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const bool value) { + setIntAttribute(headerAttributes, key, value ? 1 : 0); +} + +/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int value) { + AttributeMap::mapped_type valueVector; + char charBuf[LARGEST_INT_DIGIT_COUNT + 1]; + snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%d", value); + insertCharactersIntoVector(charBuf, &valueVector); + (*headerAttributes)[*key] = valueVector; +} + +/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( + const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, + const bool defaultValue) { + const int intDefaultValue = defaultValue ? 1 : 0; + const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue); + return intValue != 0; +} + +/* static */ int HeaderReadWriteUtils::readIntAttributeValue( + const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, + const int defaultValue) { + AttributeMap::const_iterator it = headerAttributes->find(*key); + if (it != headerAttributes->end()) { + int value = 0; + bool isNegative = false; + for (size_t i = 0; i < it->second.size(); ++i) { + if (i == 0 && it->second.at(i) == '-') { + isNegative = true; + } else { + if (!isdigit(it->second.at(i))) { + // If not a number. + return defaultValue; + } + value *= 10; + value += it->second.at(i) - '0'; + } + } + return isNegative ? -value : value; + } + return defaultValue; +} + +/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters, + std::vector<int> *const vector) { + for (int i = 0; characters[i]; ++i) { + vector->push_back(characters[i]); + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h index 5716198fb..caa5097f6 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -14,18 +14,21 @@ * limitations under the License. */ -#ifndef LATINIME_HEADER_READING_UTILS_H -#define LATINIME_HEADER_READING_UTILS_H +#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H +#define LATINIME_HEADER_READ_WRITE_UTILS_H #include <map> #include <stdint.h> #include <vector> #include "defines.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" namespace latinime { -class HeaderReadingUtils { +class BufferWithExtendableBuffer; + +class HeaderReadWriteUtils { public: typedef uint16_t DictionaryFlags; typedef std::map<std::vector<int>, std::vector<int> > AttributeMap; @@ -51,11 +54,44 @@ class HeaderReadingUtils { + HEADER_SIZE_FIELD_SIZE; } + static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap( + const HeaderReadWriteUtils::AttributeMap *const attributeMap); + static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, AttributeMap *const headerAttributes); + static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, + const FormatUtils::FORMAT_VERSION version, int *const writingPos); + + static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer, + const DictionaryFlags flags, int *const writingPos); + + static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer, + const int size, int *const writingPos); + + static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer, + const AttributeMap *const headerAttributes, int *const writingPos); + + /** + * Methods for header attributes. + */ + static void setBoolAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const bool value); + + static void setIntAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int value); + + static bool readBoolAttributeValue(const AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const bool defaultValue); + + static int readIntAttributeValue(const AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int defaultValue); + + static void insertCharactersIntoVector(const char *const characters, + AttributeMap::key_type *const key); + private: - DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadingUtils); + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); static const int MAX_ATTRIBUTE_KEY_LENGTH; static const int MAX_ATTRIBUTE_VALUE_LENGTH; @@ -72,7 +108,10 @@ class HeaderReadingUtils { static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG; static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG; static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG; - static const DictionaryFlags CONTAINS_BIGRAMS_FLAG; + + static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY; + static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; + static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY; }; } -#endif /* LATINIME_HEADER_READING_UTILS_H */ +#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.cpp deleted file mode 100644 index 186c043c1..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_reading_utils.cpp +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/policyimpl/dictionary/header/header_reading_utils.h" - -#include <vector> - -#include "defines.h" -#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" - -namespace latinime { - -const int HeaderReadingUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; -const int HeaderReadingUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; - -const int HeaderReadingUtils::HEADER_MAGIC_NUMBER_SIZE = 4; -const int HeaderReadingUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; -const int HeaderReadingUtils::HEADER_FLAG_SIZE = 2; -const int HeaderReadingUtils::HEADER_SIZE_FIELD_SIZE = 4; - -const HeaderReadingUtils::DictionaryFlags HeaderReadingUtils::NO_FLAGS = 0; -// Flags for special processing -// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or -// something very bad (like, the apocalypse) will happen. Please update both at the same time. -const HeaderReadingUtils::DictionaryFlags - HeaderReadingUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; -const HeaderReadingUtils::DictionaryFlags - HeaderReadingUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; -const HeaderReadingUtils::DictionaryFlags - HeaderReadingUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; - -/* static */ int HeaderReadingUtils::getHeaderSize(const uint8_t *const dictBuf) { - // See the format of the header in the comment in - // BinaryDictionaryFormatUtils::detectFormatVersion() - return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE - + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE); -} - -/* static */ HeaderReadingUtils::DictionaryFlags - HeaderReadingUtils::getFlags(const uint8_t *const dictBuf) { - return ByteArrayUtils::readUint16(dictBuf, - HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE); -} - -/* static */ void HeaderReadingUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf, - AttributeMap *const headerAttributes) { - const int headerSize = getHeaderSize(dictBuf); - int pos = getHeaderOptionsPosition(); - if (pos == NOT_A_DICT_POS) { - // The header doesn't have header options. - return; - } - int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; - int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH]; - while (pos < headerSize) { - const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos); - std::vector<int> key; - key.insert(key.end(), keyBuffer, keyBuffer + keyLength); - const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, - MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos); - std::vector<int> value; - value.insert(value.end(), valueBuffer, valueBuffer + valueLength); - headerAttributes->insert(AttributeMap::value_type(key, value)); - } -} - -} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp index e6cff431b..8a84bd261 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp @@ -31,9 +31,21 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, return; } int nextPos = dicNode->getChildrenPos(); + if (nextPos < 0 || nextPos >= mDictBufferSize) { + AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", + nextPos, mDictBufferSize); + ASSERT(false); + return; + } const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( mDictRoot, &nextPos); for (int i = 0; i < childCount; i++) { + if (nextPos < 0 || nextPos >= mDictBufferSize) { + AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d", + nextPos, mDictBufferSize, i, childCount); + ASSERT(false); + return; + } nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); } } @@ -49,7 +61,7 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, // with a z, it's the last PtNode of the root array, so all children addresses will be smaller // than the position we look for, and we have to descend the z node). /* Parameters : - * nodePos: the byte position of the terminal PtNode of the word we are searching for (this is + * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is * what is stored as the "bigram position" in each bigram) * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. * outUnigramProbability: a pointer to an int to write the probability into. @@ -57,7 +69,7 @@ void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, */ // TODO: Split this function to be more readable int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( - const int nodePos, const int maxCodePointCount, int *const outCodePoints, + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { int pos = getRootPosition(); int wordPos = 0; @@ -78,7 +90,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( mDictRoot, &pos); - if (nodePos == startPos) { + if (ptNodePos == startPos) { // We found the position. Copy the rest of the code points in the buffer and return // the length. outCodePoints[wordPos] = character; @@ -121,7 +133,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( // Here comes the tricky part. First, read the children position. const int childrenPos = PatriciaTrieReadingUtils ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, ¤tPos); - if (childrenPos > nodePos) { + if (childrenPos > ptNodePos) { // If the children pos is greater than the position, it means the previous // PtNode, which position is stored in lastCandidatePtNodePos, was the right // one. @@ -213,7 +225,7 @@ int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( } } - // If we have looked through all the PtNodes and found no match, the nodePos is + // If we have looked through all the PtNodes and found no match, the ptNodePos is // not the position of a terminal in this dictionary. return 0; } @@ -319,11 +331,11 @@ int PatriciaTriePolicy::getProbability(const int unigramProbability, } } -int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const { - if (nodePos == NOT_A_DICT_POS) { +int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_PROBABILITY; } - int pos = nodePos; + int pos = ptNodePos; const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); if (!PatriciaTrieReadingUtils::isTerminal(flags)) { @@ -341,11 +353,11 @@ int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int nodePos) const { mDictRoot, &pos), NOT_A_PROBABILITY); } -int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { - if (nodePos == NOT_A_DICT_POS) { +int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - int pos = nodePos; + int pos = ptNodePos; const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { @@ -361,11 +373,11 @@ int PatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) const { return pos; } -int PatriciaTriePolicy::getBigramsPositionOfNode(const int nodePos) const { - if (nodePos == NOT_A_DICT_POS) { +int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { return NOT_A_DICT_POS; } - int pos = nodePos; + int pos = ptNodePos; const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); if (!PatriciaTrieReadingUtils::hasBigrams(flags)) { @@ -385,8 +397,8 @@ int PatriciaTriePolicy::getBigramsPositionOfNode(const int nodePos) const { } int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, - const int nodePos, DicNodeVector *childDicNodes) const { - int pos = nodePos; + const int ptNodePos, DicNodeVector *childDicNodes) const { + int pos = ptNodePos; const PatriciaTrieReadingUtils::NodeFlags flags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); int mergedNodeCodePoints[MAX_WORD_LENGTH]; @@ -404,7 +416,12 @@ int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNod if (PatriciaTrieReadingUtils::hasBigrams(flags)) { getBigramsStructurePolicy()->skipAllBigrams(&pos); } - childDicNodes->pushLeavingChild(dicNode, nodePos, childrenPos, probability, + if (mergedNodeCodePointCount <= 0) { + AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); + ASSERT(false); + return pos; + } + childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, PatriciaTrieReadingUtils::isTerminal(flags), PatriciaTrieReadingUtils::hasChildrenInFlags(flags), PatriciaTrieReadingUtils::isBlacklisted(flags) || diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h index cee3e4ab2..f1de914cb 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -34,8 +34,9 @@ class DicNodeVector; class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: PatriciaTriePolicy(const MmappedBuffer *const buffer) - : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()), + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()), + mDictBufferSize(mBuffer->getBufferSize() - mHeaderPolicy.getSize()), mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {} ~PatriciaTriePolicy() { @@ -58,11 +59,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { int getProbability(const int unigramProbability, const int bigramProbability) const; - int getUnigramProbabilityOfPtNode(const int nodePos) const; + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; - int getShortcutPositionOfNode(const int nodePos) const; + int getShortcutPositionOfPtNode(const int ptNodePos) const; - int getBigramsPositionOfNode(const int nodePos) const; + int getBigramsPositionOfPtNode(const int ptNodePos) const; const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { return &mHeaderPolicy; @@ -118,10 +119,11 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { const MmappedBuffer *const mBuffer; const HeaderPolicy mHeaderPolicy; const uint8_t *const mDictRoot; + const int mDictBufferSize; const BigramListPolicy mBigramListPolicy; const ShortcutListPolicy mShortcutListPolicy; - int createAndGetLeavingChildNode(const DicNode *const dicNode, const int nodePos, + int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, DicNodeVector *const childDicNodes) const; }; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp index 1316b425f..7df55815f 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp @@ -71,8 +71,17 @@ const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, pos); } else { - if (maxLength > 0) { - outBuffer[0] = getCodePointAndAdvancePosition(buffer, pos); + const int codePoint = getCodePointAndAdvancePosition(buffer, pos); + if (codePoint == NOT_A_CODE_POINT) { + // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is + // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR + // when the PtNode has a single code point. + length = 0; + AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", + *pos - 1, codePoint, buffer[*pos - 1]); + ASSERT(false); + } else if (maxLength > 0) { + outBuffer[0] = codePoint; length = 1; } } diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h index 1803c09cb..bd3211f6a 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h @@ -31,7 +31,7 @@ namespace latinime { */ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { public: - explicit DynamicShortcutListPolicy(BufferWithExtendableBuffer *const buffer) + explicit DynamicShortcutListPolicy(const BufferWithExtendableBuffer *const buffer) : mBuffer(buffer) {} ~DynamicShortcutListPolicy() {} @@ -82,18 +82,20 @@ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { } } - // Copy shortcuts from the shortcut list that starts at fromPos to toPos and advance these - // positions after the shortcut lists. This returns whether the copy was succeeded or not. - bool copyAllShortcutsAndReturnIfSucceededOrNot(int *const fromPos, int *const toPos) { + // Copy shortcuts from the shortcut list that starts at fromPos in mBuffer to toPos in + // bufferToWrite and advance these positions after the shortcut lists. This returns whether + // the copy was succeeded or not. + bool copyAllShortcutsAndReturnIfSucceededOrNot(BufferWithExtendableBuffer *const bufferToWrite, + int *const fromPos, int *const toPos) const { const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); - const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { *fromPos -= mBuffer->getOriginalBufferSize(); } const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(buffer, fromPos); + ::getShortcutListSizeAndForwardPointer(mBuffer->getBuffer(usesAdditionalBuffer), + fromPos); // Copy shortcut list size. - if (!mBuffer->writeUintAndAdvancePosition( + if (!bufferToWrite->writeUintAndAdvancePosition( shortcutListSize + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), ShortcutListReadingUtils::getShortcutListSizeFieldSize(), toPos)) { return false; @@ -102,7 +104,7 @@ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { for (int i = 0; i < shortcutListSize; ++i) { const uint8_t data = ByteArrayUtils::readUint8AndAdvancePosition( mBuffer->getBuffer(usesAdditionalBuffer), fromPos); - if (!mBuffer->writeUintAndAdvancePosition(data, 1 /* size */, toPos)) { + if (!bufferToWrite->writeUintAndAdvancePosition(data, 1 /* size */, toPos)) { return false; } } @@ -115,7 +117,7 @@ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicShortcutListPolicy); - BufferWithExtendableBuffer *const mBuffer; + const BufferWithExtendableBuffer *const mBuffer; }; } // namespace latinime #endif // LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp index 0fed275e9..f692882f2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -18,9 +18,10 @@ namespace latinime { -const size_t BufferWithExtendableBuffer::INITIAL_ADDITIONAL_BUFFER_SIZE = 16 * 1024; const size_t BufferWithExtendableBuffer::MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; -const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 16 * 1024; +const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; +// TODO: Needs to allocate larger memory corresponding to the current vector size. +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos) { @@ -64,6 +65,16 @@ bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *co return true; } +bool BufferWithExtendableBuffer::extendBuffer() { + const size_t sizeAfterExtending = + mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + if (sizeAfterExtending > mMaxAdditionalBufferSize) { + return false; + } + mAdditionalBuffer.resize(mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP); + return true; +} + bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) { if (isInAdditionalBuffer(pos)) { const int tailPosition = getTailPosition(); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h index c6a484131..17d2e39c2 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -32,9 +32,11 @@ namespace latinime { // raw pointer but provides several methods that handle boundary checking for writing data. class BufferWithExtendableBuffer { public: - BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize) + BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize, + const int maxAdditionalBufferSize = MAX_ADDITIONAL_BUFFER_SIZE) : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), - mAdditionalBuffer(INITIAL_ADDITIONAL_BUFFER_SIZE), mUsedAdditionalBufferSize(0) {} + mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} AK_FORCE_INLINE int getTailPosition() const { return mOriginalBufferSize + mUsedAdditionalBufferSize; @@ -61,6 +63,11 @@ class BufferWithExtendableBuffer { return mOriginalBufferSize; } + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize + * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100); + } + /** * For writing. * @@ -75,28 +82,22 @@ class BufferWithExtendableBuffer { private: DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); - static const size_t INITIAL_ADDITIONAL_BUFFER_SIZE; static const size_t MAX_ADDITIONAL_BUFFER_SIZE; + static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; uint8_t *const mOriginalBuffer; const int mOriginalBufferSize; std::vector<uint8_t> mAdditionalBuffer; int mUsedAdditionalBufferSize; + const size_t mMaxAdditionalBufferSize; // Return if the buffer is successfully extended or not. - AK_FORCE_INLINE bool extendBuffer() { - if (mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP - > MAX_ADDITIONAL_BUFFER_SIZE) { - return false; - } - mAdditionalBuffer.resize(mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP); - return true; - } + bool extendBuffer(); // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of // the additional buffer, try extending the buffer. - AK_FORCE_INLINE bool checkAndPrepareWriting(const int pos, const int size); + bool checkAndPrepareWriting(const int pos, const int size); }; } #endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp new file mode 100644 index 000000000..2e4ec2e1d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" + +#include <cstdio> +#include <cstring> + +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" + +namespace latinime { + +const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp"; + +/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath, + const int dictVersion, const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + switch (dictVersion) { + case 3: + return createEmptyV3DictFile(filePath, attributeMap); + default: + // Only version 3 dictionary is supported for now. + return false; + } +} + +/* static */ bool DictFileWritingUtils::createEmptyV3DictFile(const char *const filePath, + const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap); + headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */); + BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) { + return false; + } + return flushAllHeaderAndBodyToFile(filePath, &headerBuffer, &bodyBuffer); +} + +/* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath, + BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) { + const int tmpFileNameBufSize = strlen(filePath) + + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */; + // Name of a temporary file used for writing that is a connected string of original name and + // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE. + char tmpFileName[tmpFileNameBufSize]; + snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", filePath, + TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); + FILE *const file = fopen(tmpFileName, "wb"); + if (!file) { + AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName); + ASSERT(false); + return false; + } + // Write the dictionary header. + if (!writeBufferToFile(file, dictHeader)) { + remove(tmpFileName); + AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition()); + ASSERT(false); + return false; + } + // Write the dictionary body. + if (!writeBufferToFile(file, dictBody)) { + remove(tmpFileName); + AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition()); + ASSERT(false); + return false; + } + fclose(file); + rename(tmpFileName, filePath); + return true; +} + +// This closes file pointer when an error is caused and returns whether the writing was succeeded +// or not. +/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer) { + const int originalBufSize = buffer->getOriginalBufferSize(); + if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */), + originalBufSize, 1, file) < 1) { + fclose(file); + return false; + } + const int additionalBufSize = buffer->getTailPosition() - buffer->getOriginalBufferSize(); + if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */), + additionalBufSize, 1, file) < 1) { + fclose(file); + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h new file mode 100644 index 000000000..bd4ac66fd --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H +#define LATINIME_DICT_FILE_WRITING_UTILS_H + +#include <cstdio> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class DictFileWritingUtils { + public: + static bool createEmptyDictFile(const char *const filePath, const int dictVersion, + const HeaderReadWriteUtils::AttributeMap *const attributeMap); + + static bool flushAllHeaderAndBodyToFile(const char *const filePath, + BufferWithExtendableBuffer *const dictHeader, + BufferWithExtendableBuffer *const dictBody); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils); + + static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE; + + static bool createEmptyV3DictFile(const char *const filePath, + const HeaderReadWriteUtils::AttributeMap *const attributeMap); + + static bool writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer); +}; +} // namespace latinime +#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp index 3796c7b7b..1d77d5c27 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -20,20 +20,10 @@ namespace latinime { -/** - * Dictionary size - */ -// Any file smaller than this is not a dictionary. -const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 4; +const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; -/** - * Format versions - */ -// 32 bit magic number is stored at the beginning of the dictionary header to reject unsupported -// or obsolete dictionary formats. -const uint32_t FormatUtils::HEADER_VERSION_2_MAGIC_NUMBER = 0x9BC13AFE; -// Magic number (4 bytes), version (2 bytes), options (2 bytes), header size (4 bytes) = 12 -const int FormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12; +// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 +const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; /* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( const uint8_t *const dict, const int dictSize) { @@ -45,16 +35,10 @@ const int FormatUtils::HEADER_VERSION_2_MINIMUM_SIZE = 12; } const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0); switch (magicNumber) { - case HEADER_VERSION_2_MAGIC_NUMBER: - // Version 2 header are at least 12 bytes long. - // If this header has the version 2 magic number but is less than 12 bytes long, - // then it's an unknown format and we need to avoid confidently reading the next bytes. - if (dictSize < HEADER_VERSION_2_MINIMUM_SIZE) { - return UNKNOWN_VERSION; - } + case MAGIC_NUMBER: // Version 2 header is as follows: // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE - // Version number (2 bytes) + // Dictionary format version number (2 bytes) // Options (2 bytes) // Header size (4 bytes) : integer, big endian if (ByteArrayUtils::readUint16(dict, 4) == 2) { diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h index f84321577..79ed0de29 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -34,14 +34,16 @@ class FormatUtils { UNKNOWN_VERSION }; + // 32 bit magic number is stored at the beginning of the dictionary header to reject + // unsupported or obsolete dictionary formats. + static const uint32_t MAGIC_NUMBER; + static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); private: DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); static const int DICTIONARY_MINIMUM_SIZE; - static const uint32_t HEADER_VERSION_2_MAGIC_NUMBER; - static const int HEADER_VERSION_2_MINIMUM_SIZE; }; } // namespace latinime #endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index b6aa85896..9f0a331e3 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -74,7 +74,8 @@ class TypingWeighting : public Weighting { // Note: min() required since length can be MAX_POINT_TO_KEY_LENGTH for characters not on // the keyboard (like accented letters) const float normalizedSquaredLength = traverseSession->getProximityInfoState(0) - ->getPointToKeyLength(pointIndex, dicNode->getNodeCodePoint()); + ->getPointToKeyLength(pointIndex, + CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint())); const float normalizedDistance = TouchPositionCorrectionUtils::getSweetSpotFactor( traverseSession->isTouchPositionCorrectionEnabled(), normalizedSquaredLength); const float weightedDistance = ScoringParams::DISTANCE_WEIGHT_LENGTH * normalizedDistance; @@ -113,10 +114,10 @@ class TypingWeighting : public Weighting { const int16_t parentPointIndex = parentDicNode->getInputIndex(0); const int prevCodePoint = parentDicNode->getNodeCodePoint(); const float distance1 = traverseSession->getProximityInfoState(0)->getPointToKeyLength( - parentPointIndex + 1, prevCodePoint); + parentPointIndex + 1, CharUtils::toBaseLowerCase(prevCodePoint)); const int codePoint = dicNode->getNodeCodePoint(); const float distance2 = traverseSession->getProximityInfoState(0)->getPointToKeyLength( - parentPointIndex, codePoint); + parentPointIndex, CharUtils::toBaseLowerCase(codePoint)); const float distance = distance1 + distance2; const float weightedLengthDistance = distance * ScoringParams::DISTANCE_WEIGHT_LENGTH; @@ -133,7 +134,7 @@ class TypingWeighting : public Weighting { const bool existsAdjacentProximityChars = traverseSession->getProximityInfoState(0) ->existsAdjacentProximityChars(insertedPointIndex); const float dist = traverseSession->getProximityInfoState(0)->getPointToKeyLength( - insertedPointIndex + 1, dicNode->getNodeCodePoint()); + insertedPointIndex + 1, CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint())); const float weightedDistance = dist * ScoringParams::DISTANCE_WEIGHT_LENGTH; const bool singleChar = dicNode->getNodeCodePointCount() == 1; float cost = (singleChar ? ScoringParams::INSERTION_COST_FIRST_CHAR : 0.0f); |