diff options
Diffstat (limited to 'native')
22 files changed, 1096 insertions, 340 deletions
diff --git a/native/jni/Android.mk b/native/jni/Android.mk index bf188971e..0e36f3df9 100644 --- a/native/jni/Android.mk +++ b/native/jni/Android.mk @@ -67,19 +67,22 @@ LATIN_IME_CORE_SRC_FILES := \ suggest/core/policy/weighting.cpp \ suggest/core/session/dic_traverse_session.cpp \ $(addprefix suggest/policyimpl/dictionary/, \ - bigram/bigram_list_reading_utils.cpp \ + bigram/bigram_list_read_write_utils.cpp \ + bigram/dynamic_bigram_list_policy.cpp \ header/header_policy.cpp \ header/header_reading_utils.cpp \ shortcut/shortcut_list_reading_utils.cpp \ dictionary_structure_with_buffer_policy_factory.cpp \ dynamic_patricia_trie_node_reader.cpp \ dynamic_patricia_trie_policy.cpp \ + dynamic_patricia_trie_reading_helper.cpp \ dynamic_patricia_trie_reading_utils.cpp \ + dynamic_patricia_trie_writing_helper.cpp \ patricia_trie_policy.cpp \ patricia_trie_reading_utils.cpp) \ $(addprefix suggest/policyimpl/dictionary/utils/, \ + buffer_with_extendable_buffer.cpp \ byte_array_utils.cpp \ - extendable_buffer.cpp \ format_utils.cpp) \ suggest/policyimpl/gesture/gesture_suggest_policy_factory.cpp \ $(addprefix suggest/policyimpl/typing/, \ diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp index ebe76467a..e74a1dbc8 100644 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp @@ -117,9 +117,15 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng mDictionaryStructurePolicy->getBigramsStructurePolicy(), pos); while (bigramsIt.hasNext()) { bigramsIt.next(); - const int length = mDictionaryStructurePolicy-> + if (bigramsIt.getBigramPos() == NOT_A_VALID_WORD_POS) { + continue; + } + const int codePointCount = mDictionaryStructurePolicy-> getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(), MAX_WORD_LENGTH, bigramBuffer, &unigramProbability); + if (codePointCount <= 0) { + continue; + } // Due to space constraints, the probability for bigrams is approximate - the lower the // unigram probability, the worse the precision. The theoritical maximum error in // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 @@ -127,8 +133,8 @@ int BigramDictionary::getPredictions(const int *prevWord, const int prevWordLeng // here, but it can't get too bad. const int probability = ProbabilityUtils::computeProbabilityForBigram( unigramProbability, bigramsIt.getProbability()); - addWordBigram(bigramBuffer, length, probability, outBigramProbability, outBigramCodePoints, - outputTypes); + addWordBigram(bigramBuffer, codePointCount, probability, outBigramProbability, + outBigramCodePoints, outputTypes); ++bigramCount; } return min(bigramCount, MAX_RESULTS); diff --git a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h index 97d4cd161..fb4a80083 100644 --- a/native/jni/src/suggest/core/dictionary/multi_bigram_map.h +++ b/native/jni/src/suggest/core/dictionary/multi_bigram_map.h @@ -73,6 +73,9 @@ class MultiBigramMap { bigramsListPos); while (bigramsIt.hasNext()) { bigramsIt.next(); + if (bigramsIt.getBigramPos() == NOT_A_VALID_WORD_POS) { + continue; + } mBigramMap[bigramsIt.getBigramPos()] = bigramsIt.getProbability(); mBloomFilter.setInFilter(bigramsIt.getBigramPos()); } diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h index beb9bee27..26015d512 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h @@ -21,7 +21,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" namespace latinime { @@ -33,16 +33,16 @@ class BigramListPolicy : public DictionaryBigramsStructurePolicy { void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, int *const pos) const { - const BigramListReadingUtils::BigramFlags flags = - BigramListReadingUtils::getFlagsAndForwardPointer(mBigramsBuf, pos); - *outBigramPos = BigramListReadingUtils::getBigramAddressAndForwardPointer( + const BigramListReadWriteUtils::BigramFlags flags = + BigramListReadWriteUtils::getFlagsAndForwardPointer(mBigramsBuf, pos); + *outBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( mBigramsBuf, flags, pos); - *outProbability = BigramListReadingUtils::getProbabilityFromFlags(flags); - *outHasNext = BigramListReadingUtils::hasNext(flags); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); } void skipAllBigrams(int *const pos) const { - BigramListReadingUtils::skipExistingBigrams(mBigramsBuf, pos); + BigramListReadWriteUtils::skipExistingBigrams(mBigramsBuf, pos); } private: diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp index 6da0e8b85..d57547445 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp @@ -14,30 +14,31 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" #include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" namespace latinime { -const BigramListReadingUtils::BigramFlags BigramListReadingUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30; -const BigramListReadingUtils::BigramFlags - BigramListReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; -const BigramListReadingUtils::BigramFlags - BigramListReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; -const BigramListReadingUtils::BigramFlags - BigramListReadingUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; -const BigramListReadingUtils::BigramFlags - BigramListReadingUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; // Flag for presence of more attributes -const BigramListReadingUtils::BigramFlags BigramListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT = + 0x80; // Mask for attribute probability, stored on 4 bits inside the flags byte. -const BigramListReadingUtils::BigramFlags - BigramListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; -const int BigramListReadingUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; -/* static */ int BigramListReadingUtils::getBigramAddressAndForwardPointer( +/* static */ int BigramListReadWriteUtils::getBigramAddressAndForwardPointer( const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) { int offset = 0; const int origin = *pos; @@ -52,6 +53,9 @@ const int BigramListReadingUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); break; } + if (offset == 0) { + return NOT_A_VALID_WORD_POS; + } if (isOffsetNegative(flags)) { return origin - offset; } else { diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h index d0c584bd5..ee2b722a4 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h @@ -14,9 +14,10 @@ * limitations under the License. */ -#ifndef LATINIME_BIGRAM_LIST_READING_UTILS_H -#define LATINIME_BIGRAM_LIST_READING_UTILS_H +#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H +#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H +#include <cstdlib> #include <stdint.h> #include "defines.h" @@ -24,7 +25,7 @@ namespace latinime { -class BigramListReadingUtils { +class BigramListReadWriteUtils { public: typedef uint8_t BigramFlags; @@ -55,8 +56,62 @@ public: static int getBigramAddressAndForwardPointer(const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos); + // Returns the size of the bigram position field that is stored in bigram flags. + static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { + return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + /* Note: this is a value-dependant optimization of what may probably be + more readably written this way: + switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + default: return 0; + } + */ + } + + static AK_FORCE_INLINE BigramFlags setHasNextFlag(const BigramFlags flags) { + return flags | FLAG_ATTRIBUTE_HAS_NEXT; + } + + // Returns true if the bigram entry is valid and put entry values into out*. + static AK_FORCE_INLINE bool createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( + const int entryPos, const int targetPos, const int probability, const bool hasNext, + BigramFlags *const outBigramFlags, uint32_t *const outOffset, + int *const outOffsetFieldSize) { + if (targetPos == NOT_A_VALID_WORD_POS) { + return false; + } + BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; + if (hasNext) { + flags |= FLAG_ATTRIBUTE_HAS_NEXT; + } + const int targetFieldPos = entryPos + 1; + const int offset = targetPos - targetFieldPos; + if (offset < 0) { + flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + } + const uint32_t absOffest = abs(offset); + if ((absOffest >> 24) != 0) { + // Offset is too large. + return false; + } else if ((absOffest >> 16) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + *outOffsetFieldSize = 3; + } else if ((absOffest >> 8) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + *outOffsetFieldSize = 2; + } else { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + *outOffsetFieldSize = 1; + } + *outBigramFlags = flags; + *outOffset = absOffest; + return true; + } + private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadingUtils); + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; @@ -70,19 +125,6 @@ private: static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; } - - static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { - return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; - /* Note: this is a value-dependant optimization of what may probably be - more readably written this way: - switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { - case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; - case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; - default: return 0; - } - */ - } }; } // namespace latinime -#endif // LATINIME_BIGRAM_LIST_READING_UTILS_H +#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp new file mode 100644 index 000000000..e31a91069 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" + +namespace latinime { + +bool DynamicBigramListPolicy::copyAllBigrams(int *const fromPos, int *const toPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags flags; + do { + flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, fromPos); + int bigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + buffer, flags, fromPos); + if (bigramPos == NOT_A_VALID_WORD_POS) { + // skip invalid bigram entry. + continue; + } + if (usesAdditionalBuffer) { + bigramPos += mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags newBigramFlags; + uint32_t newBigramOffset; + int newBigramOffsetFieldSize; + if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( + *toPos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags), + BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset, + &newBigramOffsetFieldSize)) { + continue; + } + // Write bigram entry. Target buffer is always the additional buffer. + if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */,toPos)) { + return false; + } + if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize, + toPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(flags)); + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); + } + return true; +} + +bool DynamicBigramListPolicy::addBigramEntry(const int bigramPos, const int probability, + int *const pos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags flags; + do { + int entryPos = *pos; + if (usesAdditionalBuffer) { + entryPos += mBuffer->getOriginalBufferSize(); + } + flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos); + BigramListReadWriteUtils::getBigramAddressAndForwardPointer(buffer, flags, pos); + if (BigramListReadWriteUtils::hasNext(flags)) { + continue; + } + // The current last entry is found. + // First, update the flags of the last entry. + const BigramListReadWriteUtils::BigramFlags updatedFlags = + BigramListReadWriteUtils::setHasNextFlag(flags); + if (!mBuffer->writeUintAndAdvancePosition(updatedFlags, 1 /* size */, &entryPos)) { + return false; + } + // Then, add a new entry after the last entry. + BigramListReadWriteUtils::BigramFlags newBigramFlags; + uint32_t newBigramOffset; + int newBigramOffsetFieldSize; + if(!BigramListReadWriteUtils::createBigramEntryAndGetFlagsAndOffsetAndOffsetFieldSize( + *pos, bigramPos, BigramListReadWriteUtils::getProbabilityFromFlags(flags), + BigramListReadWriteUtils::hasNext(flags), &newBigramFlags, &newBigramOffset, + &newBigramOffsetFieldSize)) { + continue; + } + int newEntryPos = *pos; + if (usesAdditionalBuffer) { + newEntryPos += mBuffer->getOriginalBufferSize(); + } + // Write bigram flags. + if (!mBuffer->writeUintAndAdvancePosition(newBigramFlags, 1 /* size */, + &newEntryPos)) { + return false; + } + // Write bigram positon offset. + if (!mBuffer->writeUintAndAdvancePosition(newBigramOffset, newBigramOffsetFieldSize, + &newEntryPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(flags)); + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + return true; +} + +bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int targetBigramPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = bigramListPos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags flags; + do { + flags = BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, &pos); + int bigramOffsetFieldPos = pos; + if (usesAdditionalBuffer) { + bigramOffsetFieldPos += mBuffer->getOriginalBufferSize(); + } + int bigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( + buffer, flags, &pos); + if (usesAdditionalBuffer && bigramPos != NOT_A_VALID_WORD_POS) { + bigramPos += mBuffer->getOriginalBufferSize(); + } + if (bigramPos != targetBigramPos) { + continue; + } + // Target entry is found. Write 0 into the bigram pos field to mark the bigram invalid. + const int bigramOffsetFieldSize = + BigramListReadWriteUtils::attributeAddressSize(flags); + if (!mBuffer->writeUintAndAdvancePosition(0 /* data */, bigramOffsetFieldSize, + &bigramOffsetFieldPos)) { + return false; + } + return true; + } while(BigramListReadWriteUtils::hasNext(flags)); + return false; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h index 513878e1d..b7c05376d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -21,8 +21,8 @@ #include "defines.h" #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" -#include "suggest/policyimpl/dictionary/bigram/bigram_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -31,50 +31,57 @@ namespace latinime { */ class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { public: - DynamicBigramListPolicy(const uint8_t *const bigramsBuf, const int bufSize, - const ExtendableBuffer *const additionalBuffer) - : mDictRoot(bigramsBuf), mBufSize(bufSize), mAdditionalBuffer(additionalBuffer) {} + DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} ~DynamicBigramListPolicy() {} void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, int *const pos) const { - const bool usesAdditionalBuffer = *pos >= mBufSize; - const uint8_t *const buffer = (usesAdditionalBuffer) ? - mAdditionalBuffer->getBuffer() : mDictRoot; + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { - *pos -= mBufSize; + *pos -= mBuffer->getOriginalBufferSize(); } - const BigramListReadingUtils::BigramFlags flags = - BigramListReadingUtils::getFlagsAndForwardPointer(buffer, pos); - *outBigramPos = BigramListReadingUtils::getBigramAddressAndForwardPointer( + const BigramListReadWriteUtils::BigramFlags flags = + BigramListReadWriteUtils::getFlagsAndForwardPointer(buffer, pos); + *outBigramPos = BigramListReadWriteUtils::getBigramAddressAndForwardPointer( buffer, flags, pos); - if (usesAdditionalBuffer) { - *outBigramPos += mBufSize; + if (usesAdditionalBuffer && *outBigramPos != NOT_A_VALID_WORD_POS) { + *outBigramPos += mBuffer->getOriginalBufferSize(); } - *outProbability = BigramListReadingUtils::getProbabilityFromFlags(flags); - *outHasNext = BigramListReadingUtils::hasNext(flags); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); if (usesAdditionalBuffer) { - *pos += mBufSize; + *pos += mBuffer->getOriginalBufferSize(); } } void skipAllBigrams(int *const pos) const { - if (*pos >= mBufSize) { - *pos -= mBufSize; - BigramListReadingUtils::skipExistingBigrams(mAdditionalBuffer->getBuffer(), pos); - *pos += mBufSize; - } else { - BigramListReadingUtils::skipExistingBigrams(mDictRoot, pos); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::skipExistingBigrams(buffer, pos); + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); } } + // Copy bigrams from the bigram list that starts at fromPos to toPos and advance these + // positions after bigram lists. This method skips invalid bigram entries. + bool copyAllBigrams(int *const fromPos, int *const toPos); + + bool addBigramEntry(const int bigramPos, const int probability, int *const pos); + + // Return if targetBigramPos is found or not. + bool removeBigram(const int bigramListPos, const int targetBigramPos); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicBigramListPolicy); - const uint8_t *const mDictRoot; - const int mBufSize; - const ExtendableBuffer *const mAdditionalBuffer; + BufferWithExtendableBuffer *const mBuffer; }; } // namespace latinime #endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp index c427ebe2d..6bb90fc2d 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp @@ -19,16 +19,18 @@ #include "suggest/core/policy/dictionary_bigrams_structure_policy.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(const int nodePos, const int maxCodePointCount, int *const outCodePoints) { - const bool usesAdditionalBuffer = nodePos >= mOriginalDictSize; - const uint8_t *const dictBuf = - usesAdditionalBuffer ? mExtendableBuffer->getBuffer() : mDictRoot; - int pos = (usesAdditionalBuffer) ? nodePos - mOriginalDictSize : nodePos; + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(nodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = nodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); const int parentPos = DynamicPatriciaTrieReadingUtils::getParentPosAndAdvancePosition(dictBuf, &pos); @@ -45,17 +47,13 @@ void DynamicPatriciaTrieNodeReader::fetchNodeInfoFromBufferAndProcessMovedNode(c } else { mProbability = NOT_A_PROBABILITY; } - if (hasChildren()) { - mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( - dictBuf, mFlags, &pos); - if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) { - mChildrenPos += mOriginalDictSize; - } - } else { - mChildrenPos = NOT_A_DICT_POS; + mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, mFlags, &pos); + if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) { + mChildrenPos += mBuffer->getOriginalBufferSize(); } if (usesAdditionalBuffer) { - pos += mOriginalDictSize; + pos += mBuffer->getOriginalBufferSize(); } if (PatriciaTrieReadingUtils::hasShortcutTargets(mFlags)) { mShortcutPos = pos; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h index 2a636289e..acc68b321 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h @@ -25,9 +25,9 @@ namespace latinime { +class BufferWithExtendableBuffer; class DictionaryBigramsStructurePolicy; class DictionaryShortcutsStructurePolicy; -class ExtendableBuffer; /* * This class is used for helping to read nodes of dynamic patricia trie. This class handles moved @@ -35,12 +35,10 @@ class ExtendableBuffer; */ class DynamicPatriciaTrieNodeReader { public: - DynamicPatriciaTrieNodeReader(const uint8_t *const dictRoot, const int originalDictSize, - const ExtendableBuffer *const extendableBuffer, + DynamicPatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, const DictionaryBigramsStructurePolicy *const bigramsPolicy, const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) - : mDictRoot(dictRoot), mOriginalDictSize(originalDictSize), - mExtendableBuffer(extendableBuffer), mBigramsPolicy(bigramsPolicy), + : mBuffer(buffer), mBigramsPolicy(bigramsPolicy), mShortcutsPolicy(shortcutsPolicy), mNodePos(NOT_A_VALID_WORD_POS), mFlags(0), mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mProbability(NOT_A_PROBABILITY), mChildrenPos(NOT_A_DICT_POS), mShortcutPos(NOT_A_DICT_POS), @@ -71,7 +69,7 @@ class DynamicPatriciaTrieNodeReader { } AK_FORCE_INLINE bool hasChildren() const { - return PatriciaTrieReadingUtils::hasChildrenInFlags(mFlags); + return mChildrenPos != NOT_A_DICT_POS; } AK_FORCE_INLINE bool isTerminal() const { @@ -124,10 +122,7 @@ class DynamicPatriciaTrieNodeReader { private: DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieNodeReader); - // TODO: Consolidate mDictRoot. - const uint8_t *const mDictRoot; - const int mOriginalDictSize; - const ExtendableBuffer *const mExtendableBuffer; + const BufferWithExtendableBuffer *const mBuffer; const DictionaryBigramsStructurePolicy *const mBigramsPolicy; const DictionaryShortcutsStructurePolicy *const mShortcutsPolicy; int mNodePos; diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp index b244dd0b5..3b9878b82 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -20,95 +20,69 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" #include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" #include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" namespace latinime { -// To avoid infinite loop caused by invalid or malicious forward links. -const int DynamicPatriciaTriePolicy::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; - void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, DicNodeVector *const childDicNodes) const { if (!dicNode->hasChildren()) { return; } - DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - int mergedNodeCodePoints[MAX_WORD_LENGTH]; - int nextPos = dicNode->getChildrenPos(); - int totalChildCount = 0; - do { - const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mDictRoot, &nextPos); - totalChildCount += childCount; - if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { - // Invalid dictionary. - AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", - childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); - ASSERT(false); - return; - } - for (int i = 0; i < childCount; i++) { - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nextPos, MAX_WORD_LENGTH, - mergedNodeCodePoints); - if (!nodeReader.isDeleted()) { - // Push child node when the node is not a deleted node. - childDicNodes->pushLeavingChild(dicNode, nodeReader.getNodePos(), - nodeReader.getChildrenPos(), nodeReader.getProbability(), - nodeReader.isTerminal(), nodeReader.hasChildren(), - nodeReader.isBlacklisted() || nodeReader.isNotAWord(), - nodeReader.getCodePointCount(), mergedNodeCodePoints); - } - nextPos = nodeReader.getSiblingNodePos(); - } - nextPos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(mDictRoot, nextPos); - } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(nextPos)); + readingHelper.initWithNodeArrayPos(dicNode->getChildrenPos()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + childDicNodes->pushLeavingChild(dicNode, nodeReader->getNodePos(), + nodeReader->getChildrenPos(), nodeReader->getProbability(), + nodeReader->isTerminal() && !nodeReader->isDeleted(), + nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), + nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); + readingHelper.readNextSiblingNode(); + } } int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( const int nodePos, const int maxCodePointCount, int *const outCodePoints, int *const outUnigramProbability) const { - if (nodePos == NOT_A_VALID_WORD_POS) { - *outUnigramProbability = NOT_A_PROBABILITY; - return 0; - } // This method traverses parent nodes from the terminal by following parent pointers; thus, // node code points are stored in the buffer in the reverse order. int reverseCodePoints[maxCodePointCount]; - int mergedNodeCodePoints[maxCodePointCount]; - int codePointCount = 0; - - DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - // First, read terminal node and get its probability. - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodePos, maxCodePointCount, - mergedNodeCodePoints); - // Store terminal node probability. - *outUnigramProbability = nodeReader.getProbability(); - // Store terminal node code points to buffer in the reverse order. - for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { - reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; + // First, read the terminal node and get its probability. + readingHelper.initWithNodePos(nodePos); + if (!readingHelper.isValidTerminalNode()) { + // Node at the nodePos is not a valid terminal node. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; } - // Then, follow parent pos toward the root node. - while (nodeReader.getParentPos() != NOT_A_DICT_POS) { - // codePointCount must be incremented at least once in each iteration to ensure preventing - // infinite loop. - if (nodeReader.isDeleted() || codePointCount > maxCodePointCount - || nodeReader.getCodePointCount() <= 0) { + // Store terminal node probability. + *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); + // Then, following parent node link to the dictionary root and fetch node code points. + while (!readingHelper.isEnd()) { + if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { // The nodePos is not a valid terminal node position in the dictionary. *outUnigramProbability = NOT_A_PROBABILITY; return 0; } - // Read parent node. - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(nodeReader.getParentPos(), - maxCodePointCount, mergedNodeCodePoints); // Store node code points to buffer in the reverse order. - for (int i = nodeReader.getCodePointCount() - 1; i >= 0; --i) { - reverseCodePoints[codePointCount++] = mergedNodeCodePoints[i]; - } + readingHelper.fetchMergedNodeCodePointsInReverseOrder( + readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); + // Follow parent node toward the root node. + readingHelper.readParentNode(); + } + if (readingHelper.isError()) { + // The node position or the dictionary is invalid. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; } // Reverse the stored code points to output them. + const int codePointCount = readingHelper.getTotalCodePointCount(); for (int i = 0; i < codePointCount; ++i) { outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; } @@ -121,73 +95,39 @@ int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const in for (int i = 0; i < length; ++i) { searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; } - int mergedNodeCodePoints[MAX_WORD_LENGTH]; - int currentLength = 0; - int pos = getRootPosition(); - DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); - while (currentLength < length) { - // When foundMatchedNode becomes true, currentLength is increased at least once. - bool foundMatchedNode = false; - int totalChildCount = 0; - do { - const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( - mDictRoot, &pos); - totalChildCount += childCount; - if (childCount <= 0 || totalChildCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP) { - // Invalid dictionary. - AKLOGI("Invalid dictionary. childCount: %d, totalChildCount: %d, MAX: %d", - childCount, totalChildCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP); - ASSERT(false); + readingHelper.initWithNodeArrayPos(getRootPosition()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); + if (readingHelper.getTotalCodePointCount() > length + || !readingHelper.isMatchedCodePoint(0 /* index */, + searchCodePoints[matchedCodePointCount])) { + // Current node has too many code points or its first code point is different from + // target code point. Skip this node and read the next sibling node. + readingHelper.readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + if (!readingHelper.isMatchedCodePoint( + j, searchCodePoints[matchedCodePointCount + j])) { + // Different code point is found. The given word is not included in the dictionary. return NOT_A_VALID_WORD_POS; } - for (int i = 0; i < childCount; i++) { - nodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(pos, MAX_WORD_LENGTH, - mergedNodeCodePoints); - const int nodeCodePointCount = nodeReader.getCodePointCount(); - if (nodeReader.isDeleted() || nodeCodePointCount <= 0 - || currentLength + nodeCodePointCount > length) { - // Skip deleted or empty node. - pos = nodeReader.getSiblingNodePos(); - continue; - } - bool matched = true; - for (int j = 0; j < nodeCodePointCount; ++j) { - if (mergedNodeCodePoints[j] != searchCodePoints[currentLength + j]) { - // Different code point is found. - matched = false; - break; - } - } - if (matched) { - currentLength += nodeCodePointCount; - if (length == currentLength) { - // Terminal position is found. - return nodeReader.getNodePos(); - } - if (!nodeReader.hasChildren()) { - return NOT_A_VALID_WORD_POS; - } - foundMatchedNode = true; - // Advance to the children nodes. - pos = nodeReader.getChildrenPos(); - break; - } - // Try next sibling node. - pos = nodeReader.getSiblingNodePos(); - } - if (foundMatchedNode) { - break; - } - // If the matched node is not found in the current PtNode array, try to follow the - // forward link. - pos = DynamicPatriciaTrieReadingUtils::getForwardLinkPosition( - mDictRoot, pos); - } while (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(pos)); - if (!foundMatchedNode) { - // Matched node is not found. + } + // All characters are matched. + if (length == readingHelper.getTotalCodePointCount()) { + // Terminal position is found. + return nodeReader->getNodePos(); + } + if (!nodeReader->hasChildren()) { return NOT_A_VALID_WORD_POS; } + // Advance to the children nodes. + readingHelper.readChildNode(); } // If we already traversed the tree further than the word is long, there means // there was no match (or we would have found it). @@ -198,7 +138,7 @@ int DynamicPatriciaTriePolicy::getUnigramProbability(const int nodePos) const { if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_PROBABILITY; } - DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); nodeReader.fetchNodeInfoFromBuffer(nodePos); if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { @@ -211,7 +151,7 @@ int DynamicPatriciaTriePolicy::getShortcutPositionOfNode(const int nodePos) cons if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_DICT_POS; } - DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); nodeReader.fetchNodeInfoFromBuffer(nodePos); if (nodeReader.isDeleted()) { @@ -224,7 +164,7 @@ int DynamicPatriciaTriePolicy::getBigramsPositionOfNode(const int nodePos) const if (nodePos == NOT_A_VALID_WORD_POS) { return NOT_A_DICT_POS; } - DynamicPatriciaTrieNodeReader nodeReader(mDictRoot, mOriginalDictSize, &mExtendableBuffer, + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, getBigramsStructurePolicy(), getShortcutsStructurePolicy()); nodeReader.fetchNodeInfoFromBuffer(nodePos); if (nodeReader.isDeleted()) { @@ -239,8 +179,12 @@ bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); return false; } - // TODO: Implement. - return false; + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithNodeArrayPos(getRootPosition()); + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.addUnigramWord(&readingHelper, word, length, probability); } bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, @@ -249,8 +193,19 @@ bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); return false; } - // TODO: Implement. - return false; + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_VALID_WORD_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_VALID_WORD_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.addBigramWords(word0Pos, word1Pos, probability); } bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, @@ -259,8 +214,19 @@ bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); return false; } - // TODO: Implement. - return false; + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_VALID_WORD_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_VALID_WORD_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.removeBigramWords(word0Pos, word1Pos); } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h index 73b963212..5873d3d65 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -17,14 +17,12 @@ #ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H #define LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H -#include <stdint.h> - #include "defines.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" #include "suggest/policyimpl/dictionary/header/header_policy.h" #include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" -#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" #include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" namespace latinime { @@ -35,11 +33,11 @@ class DicNodeVector; class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { public: DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer) - : mBuffer(buffer), mExtendableBuffer(), mHeaderPolicy(mBuffer->getBuffer()), - mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()), - mOriginalDictSize(mBuffer->getBufferSize() - mHeaderPolicy.getSize()), - mBigramListPolicy(mDictRoot, mOriginalDictSize, &mExtendableBuffer), - mShortcutListPolicy(mDictRoot, mOriginalDictSize, &mExtendableBuffer) {} + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer()), + mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), + mBuffer->getBufferSize() - mHeaderPolicy.getSize()), + mBigramListPolicy(&mBufferWithExtendableBuffer), + mShortcutListPolicy(&mBufferWithExtendableBuffer) {} ~DynamicPatriciaTriePolicy() { delete mBuffer; @@ -87,17 +85,12 @@ class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); - static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; const MmappedBuffer *const mBuffer; - const ExtendableBuffer mExtendableBuffer; const HeaderPolicy mHeaderPolicy; - // TODO: Consolidate mDictRoot. - // CAVEAT!: Be careful about array out of bound access with mDictRoot - const uint8_t *const mDictRoot; - const int mOriginalDictSize; - const DynamicBigramListPolicy mBigramListPolicy; - const DynamicShortcutListPolicy mShortcutListPolicy; + BufferWithExtendableBuffer mBufferWithExtendableBuffer; + DynamicBigramListPolicy mBigramListPolicy; + DynamicShortcutListPolicy mShortcutListPolicy; }; } // namespace latinime #endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp new file mode 100644 index 000000000..2042fcbd2 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// To avoid infinite loop caused by invalid or malicious forward links. +const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; + +// Read node array size and process empty node arrays. Nodes and arrays are counted up in this +// method to avoid an infinite loop. +void DynamicPatriciaTrieReadingHelper::nextNodeArray() { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + mPos -= mBuffer->getOriginalBufferSize(); + } + mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(dictBuf, + &mPos); + if (usesAdditionalBuffer) { + mPos += mBuffer->getOriginalBufferSize(); + } + // Count up nodes and node arrays to avoid infinite loop. + mTotalNodeCount += mNodeCount; + mNodeArrayCount++; + if (mNodeCount < 0 || mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP + || mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { + // Invalid dictionary. + AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" + "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", + mNodeCount, mTotalNodeCount, MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, + mNodeArrayCount, MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); + ASSERT(false); + mIsError = true; + mPos = NOT_A_DICT_POS; + return; + } + if (mNodeCount == 0) { + // Empty node array. Try following forward link. + followForwardLink(); + } +} + +// Follow the forward link and read the next node array if exists. +void DynamicPatriciaTrieReadingHelper::followForwardLink() { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + mPos -= mBuffer->getOriginalBufferSize(); + } + const int forwardLinkPosition = + DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mPos); + if (usesAdditionalBuffer) { + mPos += mBuffer->getOriginalBufferSize(); + } + if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { + // Follow the forward link. + mPos = forwardLinkPosition; + nextNodeArray(); + } else { + // All node arrays have been read. + mPos = NOT_A_DICT_POS; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h new file mode 100644 index 000000000..b108ed5fb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and + * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. + */ +class DynamicPatriciaTrieReadingHelper { + public: + DynamicPatriciaTrieReadingHelper(const BufferWithExtendableBuffer *const buffer, + const DictionaryBigramsStructurePolicy *const bigramsPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) + : mIsError(false), mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), + mTotalNodeCount(0), mNodeArrayCount(0), mBuffer(buffer), + mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy) {} + + ~DynamicPatriciaTrieReadingHelper() {} + + AK_FORCE_INLINE bool isError() const { + return mIsError; + } + + AK_FORCE_INLINE bool isEnd() const { + return mPos == NOT_A_DICT_POS; + } + + // Initialize reading state with the head position of a node array. + AK_FORCE_INLINE void initWithNodeArrayPos(const int nodeArrayPos) { + if (nodeArrayPos == NOT_A_DICT_POS) { + mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mPos = nodeArrayPos; + mNodeCount = 0; + mPrevTotalCodePointCount = 0; + mTotalNodeCount = 0; + mNodeArrayCount = 0; + nextNodeArray(); + if (!isEnd()) { + fetchNodeInfo(); + } + } + } + + // Initialize reading state with the head position of a node. + AK_FORCE_INLINE void initWithNodePos(const int nodePos) { + // TODO: Consolidate NOT_A_VALID_WORD_POS and NOT_A_DICT_POS + if (nodePos == NOT_A_VALID_WORD_POS || nodePos == NOT_A_DICT_POS) { + mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mPos = nodePos; + mNodeCount = 1; + mPrevTotalCodePointCount = 0; + mTotalNodeCount = 1; + mNodeArrayCount = 1; + fetchNodeInfo(); + } + } + + AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const { + return &mNodeReader; + } + + AK_FORCE_INLINE bool isValidTerminalNode() const { + return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal(); + } + + AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const { + return mMergedNodeCodePoints[index] == codePoint; + } + + // Return code point count exclude the last read node's code points. + AK_FORCE_INLINE int getPrevTotalCodePointCount() const { + return mPrevTotalCodePointCount; + } + + // Return code point count include the last read node's code points. + AK_FORCE_INLINE int getTotalCodePointCount() const { + return mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); + } + + AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( + const int index, int *const outCodePoints) const { + const int nodeCodePointCount = mNodeReader.getCodePointCount(); + for (int i = 0; i < nodeCodePointCount; ++i) { + outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i]; + } + } + + AK_FORCE_INLINE const int *getMergedNodeCodePoints() const { + return mMergedNodeCodePoints; + } + + AK_FORCE_INLINE void readNextSiblingNode() { + mNodeCount -= 1; + mPos = mNodeReader.getSiblingNodePos(); + if (mNodeCount <= 0) { + // All nodes in the current node array have been read. + followForwardLink(); + if (!isEnd()) { + fetchNodeInfo(); + } + } else { + fetchNodeInfo(); + } + } + + // Read the first child node of the current node. + AK_FORCE_INLINE void readChildNode() { + if (mNodeReader.hasChildren()) { + mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mTotalNodeCount = 0; + mNodeArrayCount = 0; + mPos = mNodeReader.getChildrenPos(); + // Read children node array. + nextNodeArray(); + if (!isEnd()) { + fetchNodeInfo(); + } + } else { + mPos = NOT_A_DICT_POS; + } + } + + // Read the parent node of the current node. + AK_FORCE_INLINE void readParentNode() { + if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { + mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mTotalNodeCount = 1; + mNodeArrayCount = 1; + mNodeCount = 1; + mPos = mNodeReader.getParentPos(); + fetchNodeInfo(); + } else { + mPos = NOT_A_DICT_POS; + } + } + + private: + DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); + + static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; + static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; + + bool mIsError; + int mPos; + // Node count of a node array. + int mNodeCount; + int mPrevTotalCodePointCount; + int mTotalNodeCount; + int mNodeArrayCount; + const BufferWithExtendableBuffer *const mBuffer; + DynamicPatriciaTrieNodeReader mNodeReader; + int mMergedNodeCodePoints[MAX_WORD_LENGTH]; + + void nextNodeArray(); + + void followForwardLink(); + + AK_FORCE_INLINE void fetchNodeInfo() { + mNodeReader.fetchNodeInfoFromBufferAndGetNodeCodePoints(mPos, MAX_WORD_LENGTH, + mMergedNodeCodePoints); + if (mNodeReader.getCodePointCount() <= 0) { + // Empty node is not allowed. + mIsError = true; + mPos = NOT_A_DICT_POS; + } + } +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp index 1ef3b65c3..5d979fa51 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp @@ -32,7 +32,13 @@ const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_DELETED = 0x80; const uint8_t *const buffer, const NodeFlags flags, int *const pos) { if ((flags & MASK_MOVED) == FLAG_IS_NOT_MOVED) { const int base = *pos; - return base + ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); + const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); + if (offset == 0) { + // 0 offset means that the node does not have children. + return NOT_A_DICT_POS; + } else { + return base + offset; + } } else { return NOT_A_DICT_POS; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..128d69d88 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" + +namespace latinime { + +bool DynamicPatriciaTrieWritingHelper::addUnigramWord( + DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability) { + while (!readingHelper->isEnd()) { + const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper->getNodeReader(); + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + const int nextIndex = matchedCodePointCount + j; + if (nextIndex >= codePointCount) { + // TODO: split current node after j - 1, create child and make this terminal. + return false; + } + if (!readingHelper->isMatchedCodePoint(j, + wordCodePoints[matchedCodePointCount + j])) { + // TODO: split current node after j - 1 and create two children. + return false; + } + } + // All characters are matched. + if (codePointCount == readingHelper->getTotalCodePointCount()) { + if (nodeReader->isTerminal()) { + // TODO: Update probability. + } else { + // TODO: Make it terminal and update probability. + } + return false; + } + if (!nodeReader->hasChildren()) { + // TODO: Create children node array and add new node as a child. + return false; + } + // Advance to the children nodes. + readingHelper->readChildNode(); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + // TODO: add at the last position of the node array. + return false; +} + +bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, + const int probability) { + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoFromBuffer(word0Pos); + if (nodeReader.isDeleted()) { + return false; + } + // TODO: Implement. + return false; +} + +// Remove a bigram relation from word0Pos to word1Pos. +bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoFromBuffer(word0Pos); + if (nodeReader.isDeleted() || nodeReader.getBigramsPos() == NOT_A_DICT_POS) { + return false; + } + // TODO: Implement. + return false; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h new file mode 100644 index 000000000..f8165fc3d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H + +#include "defines.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DynamicBigramListPolicy; +class DynamicPatriciaTrieReadingHelper; +class DynamicShortcutListPolicy; + +class DynamicPatriciaTrieWritingHelper { + public: + DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy *const bigramPolicy, + DynamicShortcutListPolicy *const shortcutPolicy) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} + + ~DynamicPatriciaTrieWritingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability); + + // Add a bigram relation from word0Pos to word1Pos. + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability); + + // Remove a bigram relation from word0Pos to word1Pos. + bool removeBigramWords(const int word0Pos, const int word1Pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); + + BufferWithExtendableBuffer *const mBuffer; + DynamicBigramListPolicy *const mBigramPolicy; + DynamicShortcutListPolicy *const mShortcutPolicy; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h index fdaf18f7c..5e9c52950 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h @@ -22,7 +22,7 @@ #include "defines.h" #include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" #include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" -#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { @@ -31,9 +31,8 @@ namespace latinime { */ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { public: - DynamicShortcutListPolicy(const uint8_t *const shortcutBuf, const int bufSize, - const ExtendableBuffer *const additionalBuffer) - : mShortcutsBuf(shortcutBuf), mBufSize(bufSize), mAdditionalBuffer(additionalBuffer) {} + explicit DynamicShortcutListPolicy(BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} ~DynamicShortcutListPolicy() {} @@ -47,11 +46,10 @@ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, int *const pos) const { - const bool usesAdditionalBuffer = *pos >= mBufSize; - const uint8_t *const buffer = usesAdditionalBuffer - ? mAdditionalBuffer->getBuffer() : mShortcutsBuf; + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); if (usesAdditionalBuffer) { - *pos -= mBufSize; + *pos -= mBuffer->getOriginalBufferSize(); } const ShortcutListReadingUtils::ShortcutFlags flags = ShortcutListReadingUtils::getFlagsAndForwardPointer(buffer, pos); @@ -66,29 +64,51 @@ class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { buffer, maxCodePointCount, outCodePoint, pos); } if (usesAdditionalBuffer) { - *pos += mBufSize; + *pos += mBuffer->getOriginalBufferSize(); } } void skipAllShortcuts(int *const pos) const { - if (*pos >= mBufSize) { - *pos -= mBufSize; - const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(mAdditionalBuffer->getBuffer(), pos); - *pos += mBufSize + shortcutListSize; - } else { - const int shortcutListSize = ShortcutListReadingUtils - ::getShortcutListSizeAndForwardPointer(mShortcutsBuf, pos); - *pos += shortcutListSize; + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(buffer, pos); + *pos += shortcutListSize; + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + // Copy shortcuts from the shortcut list that starts at fromPos to toPos and advance these + // positions after the shortcut lists. + void copyAllShortcuts(int *const fromPos, int *const toPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(buffer, fromPos); + // Copy shortcut list size. + mBuffer->writeUintAndAdvancePosition( + shortcutListSize + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), toPos); + for (int i = 0; i < shortcutListSize; ++i) { + const uint8_t data = ByteArrayUtils::readUint8AndAdvancePosition(buffer, fromPos); + mBuffer->writeUintAndAdvancePosition(data, 1 /* size */, toPos); + } + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); } } private: DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicShortcutListPolicy); - const uint8_t *const mShortcutsBuf; - const int mBufSize; - const ExtendableBuffer *const mAdditionalBuffer; + BufferWithExtendableBuffer *const mBuffer; }; } // namespace latinime #endif // LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp index e55cc2410..8582c4b81 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/extendable_buffer.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -14,12 +14,12 @@ * limitations under the License. */ -#include "suggest/policyimpl/dictionary/utils/extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" namespace latinime { -const size_t ExtendableBuffer::INITIAL_BUFFER_SIZE = 16 * 1024; -const size_t ExtendableBuffer::MAX_BUFFER_SIZE = 1024 * 1024; -const size_t ExtendableBuffer::EXTEND_BUFFER_SIZE_STEP = 16 * 1024; +const size_t BufferWithExtendableBuffer::INITIAL_ADDITIONAL_BUFFER_SIZE = 16 * 1024; +const size_t BufferWithExtendableBuffer::MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 16 * 1024; } diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 000000000..ec871ec85 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -0,0 +1,140 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H +#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H + +#include <cstddef> +#include <stdint.h> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// This is used as a buffer that can be extended for updatable dictionaries. +// To optimize performance, raw pointer is directly used for reading buffer. The position has to be +// adjusted to access additional buffer. On the other hand, this class does not provide writable +// raw pointer but provides several methods that handle boundary checking for writing data. +class BufferWithExtendableBuffer { + public: + BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize) + : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), + mAdditionalBuffer(INITIAL_ADDITIONAL_BUFFER_SIZE), mUsedAdditionalBufferSize(0) {} + + AK_FORCE_INLINE int getTailPosition() const { + return mOriginalBufferSize + mUsedAdditionalBufferSize; + } + + /** + * For reading. + */ + AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { + return position >= mOriginalBufferSize; + } + + // CAVEAT!: Be careful about array out of bound access with buffers + AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { + if (usesAdditionalBuffer) { + return &mAdditionalBuffer[0]; + } else { + return mOriginalBuffer; + } + } + + AK_FORCE_INLINE int getOriginalBufferSize() const { + return mOriginalBufferSize; + } + + /** + * For writing. + * + * Writing is allowed for original buffer, already written region of additional buffer and the + * tail of additional buffer. + */ + AK_FORCE_INLINE bool writeUintAndAdvancePosition(const uint32_t data, const int size, + int *const pos) { + if (!(size >= 1 && size <= 4)) { + AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); + ASSERT(false); + return false; + } + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer; + if (usesAdditionalBuffer) { + *pos -= mOriginalBufferSize; + } + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBufferSize; + } + return true; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); + + static const size_t INITIAL_ADDITIONAL_BUFFER_SIZE; + static const size_t MAX_ADDITIONAL_BUFFER_SIZE; + static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + + uint8_t *const mOriginalBuffer; + const int mOriginalBufferSize; + std::vector<uint8_t> mAdditionalBuffer; + int mUsedAdditionalBufferSize; + + // Return if the buffer is successfully extended or not. + AK_FORCE_INLINE bool extendBuffer() { + if (mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP + > MAX_ADDITIONAL_BUFFER_SIZE) { + return false; + } + mAdditionalBuffer.resize(mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP); + return true; + } + + // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of + // the additional buffer, try extending the buffer. + AK_FORCE_INLINE bool checkAndPrepareWriting(const int pos, const int size) { + if (isInAdditionalBuffer(pos)) { + if (pos == mUsedAdditionalBufferSize) { + // Append data to the tail. + if (pos + size > static_cast<int>(mAdditionalBuffer.size())) { + // Need to extend buffer. + if (!extendBuffer()) { + return false; + } + } + mUsedAdditionalBufferSize += size; + } else if (pos + size >= mUsedAdditionalBufferSize) { + // The access will beyond the tail of used region. + return false; + } + } else { + if (pos < 0 || mOriginalBufferSize < pos + size) { + // Invalid position or violate the boundary. + return false; + } + } + return true; + } +}; +} +#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h index 75ccfc766..1d14929c7 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h @@ -29,7 +29,34 @@ namespace latinime { class ByteArrayUtils { public: /** - * Integer + * Integer writing + * + * Each method write a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, + const uint32_t data, const int size, int *const pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); + return; + case 2: + ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); + return; + case 3: + ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); + return; + case 4: + ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); + return; + default: + break; + } + } + + /** + * Integer reading * * Each method read a corresponding size integer in a big endian manner. */ @@ -187,6 +214,32 @@ class ByteArrayUtils { static const uint8_t MINIMAL_ONE_BYTE_CHARACTER_VALUE; static const uint8_t CHARACTER_ARRAY_TERMINATOR; + + static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 24) & 0xFF; + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, + const uint16_t data, int *const pos) { + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, + const uint8_t data, int *const pos) { + buffer[(*pos)++] = data & 0xFF; + } }; } // namespace latinime #endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/extendable_buffer.h deleted file mode 100644 index 5c75027d2..000000000 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/extendable_buffer.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2013, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_EXTENDABLE_BUFFER_H -#define LATINIME_EXTENDABLE_BUFFER_H - -#include <cstddef> -#include <stdint.h> -#include <vector> - -#include "defines.h" - -namespace latinime { - -// This is used as a buffer that can be extended for updatable dictionaries. -class ExtendableBuffer { - public: - ExtendableBuffer() : mBuffer(INITIAL_BUFFER_SIZE), mUsedSize(0) {} - - AK_FORCE_INLINE const uint8_t *getBuffer() const { - return &mBuffer[0]; - } - - // Return if the buffer is successfully extended or not. - AK_FORCE_INLINE bool extendBuffer() { - if (mBuffer.size() + EXTEND_BUFFER_SIZE_STEP > MAX_BUFFER_SIZE) { - return false; - } - mBuffer.resize(mBuffer.size() + EXTEND_BUFFER_SIZE_STEP); - return true; - } - - AK_FORCE_INLINE int getAllocatedSize() const { - return mBuffer.size(); - } - - AK_FORCE_INLINE int getUsedSize() const { - return mUsedSize; - } - - AK_FORCE_INLINE void clear() { - mUsedSize = 0; - mBuffer.clear(); - } - - private: - DISALLOW_COPY_AND_ASSIGN(ExtendableBuffer); - - static const size_t INITIAL_BUFFER_SIZE; - static const size_t MAX_BUFFER_SIZE; - static const size_t EXTEND_BUFFER_SIZE_STEP; - - std::vector<uint8_t> mBuffer; - int mUsedSize; -}; -} -#endif /* LATINIME_MMAPED_BUFFER_H */ |