diff options
Diffstat (limited to 'native/jni/src/suggest/policyimpl')
51 files changed, 6073 insertions, 57 deletions
diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h new file mode 100644 index 000000000..6ff95cac4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_policy.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_POLICY_H +#define LATINIME_BIGRAM_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" + +namespace latinime { + +class BigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + explicit BigramListPolicy(const uint8_t *const bigramsBuf) : mBigramsBuf(bigramsBuf) {} + + ~BigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const pos) const { + BigramListReadWriteUtils::BigramFlags flags; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(mBigramsBuf, &flags, + outBigramPos, pos); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(flags); + *outHasNext = BigramListReadWriteUtils::hasNext(flags); + } + + void skipAllBigrams(int *const pos) const { + BigramListReadWriteUtils::skipExistingBigrams(mBigramsBuf, pos); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListPolicy); + + const uint8_t *const mBigramsBuf; +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp new file mode 100644 index 000000000..1926b9831 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::MASK_ATTRIBUTE_ADDRESS_TYPE = + 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30; +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; +// Flag for presence of more attributes +const BigramListReadWriteUtils::BigramFlags BigramListReadWriteUtils::FLAG_ATTRIBUTE_HAS_NEXT = + 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const BigramListReadWriteUtils::BigramFlags + BigramListReadWriteUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int BigramListReadWriteUtils::ATTRIBUTE_ADDRESS_SHIFT = 4; + +/* static */ void BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + const uint8_t *const bigramsBuf, BigramFlags *const outBigramFlags, + int *const outTargetPtNodePos, int *const bigramEntryPos) { + const BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, + bigramEntryPos); + if (outBigramFlags) { + *outBigramFlags = bigramFlags; + } + const int targetPos = getBigramAddressAndAdvancePosition(bigramsBuf, bigramFlags, + bigramEntryPos); + if (outTargetPtNodePos) { + *outTargetPtNodePos = targetPos; + } +} + +/* static */ void BigramListReadWriteUtils::skipExistingBigrams(const uint8_t *const bigramsBuf, + int *const bigramListPos) { + BigramFlags flags; + do { + getBigramEntryPropertiesAndAdvancePosition(bigramsBuf, &flags, 0 /* outTargetPtNodePos */, + bigramListPos); + } while(hasNext(flags)); +} + +/* static */ int BigramListReadWriteUtils::getBigramAddressAndAdvancePosition( + const uint8_t *const bigramsBuf, const BigramFlags flags, int *const pos) { + int offset = 0; + const int origin = *pos; + switch (MASK_ATTRIBUTE_ADDRESS_TYPE & flags) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(bigramsBuf, pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(bigramsBuf, pos); + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(bigramsBuf, pos); + break; + } + if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID) { + return NOT_A_DICT_POS; + } else if (offset == DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET) { + return origin; + } + if (isOffsetNegative(flags)) { + return origin - offset; + } else { + return origin + offset; + } +} + +/* static */ bool BigramListReadWriteUtils::setHasNextFlag( + BufferWithExtendableBuffer *const buffer, const bool hasNext, const int entryPos) { + const bool usesAdditionalBuffer = buffer->isInAdditionalBuffer(entryPos); + int readingPos = entryPos; + if (usesAdditionalBuffer) { + readingPos -= buffer->getOriginalBufferSize(); + } + BigramFlags bigramFlags = ByteArrayUtils::readUint8AndAdvancePosition( + buffer->getBuffer(usesAdditionalBuffer), &readingPos); + if (hasNext) { + bigramFlags = bigramFlags | FLAG_ATTRIBUTE_HAS_NEXT; + } else { + bigramFlags = bigramFlags & (~FLAG_ATTRIBUTE_HAS_NEXT); + } + int writingPos = entryPos; + return buffer->writeUintAndAdvancePosition(bigramFlags, 1 /* size */, &writingPos); +} + +/* static */ bool BigramListReadWriteUtils::createAndWriteBigramEntry( + BufferWithExtendableBuffer *const buffer, const int targetPos, const int probability, + const bool hasNext, int *const writingPos) { + BigramFlags flags; + if (!createAndGetBigramFlags(*writingPos, targetPos, probability, hasNext, &flags)) { + return false; + } + return writeBigramEntry(buffer, flags, targetPos, writingPos); +} + +/* static */ bool BigramListReadWriteUtils::writeBigramEntry( + BufferWithExtendableBuffer *const bufferToWrite, const BigramFlags flags, + const int targetPtNodePos, int *const writingPos) { + const int offset = getBigramTargetOffset(targetPtNodePos, *writingPos); + const BigramFlags flagsToWrite = (offset < 0) ? + (flags | FLAG_ATTRIBUTE_OFFSET_NEGATIVE) : (flags & ~FLAG_ATTRIBUTE_OFFSET_NEGATIVE); + if (!bufferToWrite->writeUintAndAdvancePosition(flagsToWrite, 1 /* size */, writingPos)) { + return false; + } + const uint32_t absOffest = abs(offset); + const int bigramTargetFieldSize = attributeAddressSize(flags); + return bufferToWrite->writeUintAndAdvancePosition(absOffest, bigramTargetFieldSize, + writingPos); +} + +// Returns true if the bigram entry is valid and put entry flags into out*. +/* static */ bool BigramListReadWriteUtils::createAndGetBigramFlags(const int entryPos, + const int targetPtNodePos, const int probability, const bool hasNext, + BigramFlags *const outBigramFlags) { + BigramFlags flags = probability & MASK_ATTRIBUTE_PROBABILITY; + if (hasNext) { + flags |= FLAG_ATTRIBUTE_HAS_NEXT; + } + const int offset = getBigramTargetOffset(targetPtNodePos, entryPos); + if (offset < 0) { + flags |= FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + } + const uint32_t absOffest = abs(offset); + if ((absOffest >> 24) != 0) { + // Offset is too large. + return false; + } else if ((absOffest >> 16) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + } else if ((absOffest >> 8) != 0) { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + } else { + flags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + } + // Currently, all newly written bigram position fields are 3 bytes to simplify dictionary + // writing. + // TODO: Remove following 2 lines and optimize memory space. + flags = (flags & (~MASK_ATTRIBUTE_ADDRESS_TYPE)) | FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + *outBigramFlags = flags; + return true; +} + +/* static */ int BigramListReadWriteUtils::getBigramTargetOffset(const int targetPtNodePos, + const int entryPos) { + if (targetPtNodePos == NOT_A_DICT_POS) { + return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; + } else { + const int offset = targetPtNodePos - (entryPos + 1 /* bigramFlagsField */); + if (offset == 0) { + return DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } else { + return offset; + } + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h new file mode 100644 index 000000000..eabe4e099 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H +#define LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H + +#include <cstdlib> +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class BigramListReadWriteUtils { +public: + typedef uint8_t BigramFlags; + + static void getBigramEntryPropertiesAndAdvancePosition(const uint8_t *const bigramsBuf, + BigramFlags *const outBigramFlags, int *const outTargetPtNodePos, + int *const bigramEntryPos); + + static AK_FORCE_INLINE int getProbabilityFromFlags(const BigramFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // Bigrams reading methods + static void skipExistingBigrams(const uint8_t *const bigramsBuf, int *const bigramListPos); + + // Returns the size of the bigram position field that is stored in bigram flags. + static AK_FORCE_INLINE int attributeAddressSize(const BigramFlags flags) { + return (flags & MASK_ATTRIBUTE_ADDRESS_TYPE) >> ATTRIBUTE_ADDRESS_SHIFT; + /* Note: this is a value-dependant optimization of what may probably be + more readably written this way: + switch (flags * BinaryFormat::MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: return 1; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: return 2; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTE: return 3; + default: return 0; + } + */ + } + + static bool setHasNextFlag(BufferWithExtendableBuffer *const buffer, + const bool hasNext, const int entryPos); + + static AK_FORCE_INLINE BigramFlags setProbabilityInFlags(const BigramFlags flags, + const int probability) { + return (flags & (~MASK_ATTRIBUTE_PROBABILITY)) | (probability & MASK_ATTRIBUTE_PROBABILITY); + } + + static bool createAndWriteBigramEntry(BufferWithExtendableBuffer *const buffer, + const int targetPos, const int probability, const bool hasNext, int *const writingPos); + + static bool writeBigramEntry(BufferWithExtendableBuffer *const buffer, const BigramFlags flags, + const int targetOffset, int *const writingPos); + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(BigramListReadWriteUtils); + + static const BigramFlags MASK_ATTRIBUTE_ADDRESS_TYPE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES; + static const BigramFlags FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES; + static const BigramFlags FLAG_ATTRIBUTE_OFFSET_NEGATIVE; + static const BigramFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const BigramFlags MASK_ATTRIBUTE_PROBABILITY; + static const int ATTRIBUTE_ADDRESS_SHIFT; + + // Returns true if the bigram entry is valid and put entry flags into out*. + static bool createAndGetBigramFlags(const int entryPos, const int targetPos, + const int probability, const bool hasNext, BigramFlags *const outBigramFlags); + + static AK_FORCE_INLINE bool isOffsetNegative(const BigramFlags flags) { + return (flags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) != 0; + } + + static int getBigramAddressAndAdvancePosition(const uint8_t *const bigramsBuf, + const BigramFlags flags, int *const pos); + + static int getBigramTargetOffset(const int targetPtNodePos, const int entryPos); +}; +} // namespace latinime +#endif // LATINIME_BIGRAM_LIST_READ_WRITE_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp new file mode 100644 index 000000000..29307b56a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.cpp @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" + +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_read_write_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const int DynamicBigramListPolicy::CONTINUING_BIGRAM_LINK_COUNT_LIMIT = 10000; +const int DynamicBigramListPolicy::BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT = 100000; + +void DynamicBigramListPolicy::getNextBigram(int *const outBigramPos, int *const outProbability, + bool *const outHasNext, int *const bigramEntryPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramEntryPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *bigramEntryPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int originalBigramPos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition(buffer, &bigramFlags, + &originalBigramPos, bigramEntryPos); + if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + *outBigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + *outProbability = BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags); + *outHasNext = BigramListReadWriteUtils::hasNext(bigramFlags); + if (usesAdditionalBuffer) { + *bigramEntryPos += mBuffer->getOriginalBufferSize(); + } +} + +void DynamicBigramListPolicy::skipAllBigrams(int *const bigramListPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::skipExistingBigrams(buffer, bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos += mBuffer->getOriginalBufferSize(); + } +} + +bool DynamicBigramListPolicy::copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, + int *const fromPos, int *const toPos, int *const outBigramsCount) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + *outBigramsCount = 0; + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + int lastWrittenEntryPos = NOT_A_DICT_POS; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + // The buffer address can be changed after calling buffer writing methods. + int originalBigramPos; + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + fromPos); + if (originalBigramPos == NOT_A_DICT_POS) { + // skip invalid bigram entry. + continue; + } + if (usesAdditionalBuffer) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + if (bigramPos == NOT_A_DICT_POS) { + // Target PtNode has been invalidated. + continue; + } + lastWrittenEntryPos = *toPos; + if (!BigramListReadWriteUtils::createAndWriteBigramEntry(bufferToWrite, bigramPos, + BigramListReadWriteUtils::getProbabilityFromFlags(bigramFlags), + BigramListReadWriteUtils::hasNext(bigramFlags), toPos)) { + return false; + } + (*outBigramsCount)++; + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + // Makes the last entry the terminal of the list. Updates the flags. + if (lastWrittenEntryPos != NOT_A_DICT_POS) { + if (!BigramListReadWriteUtils::setHasNextFlag(bufferToWrite, false /* hasNext */, + lastWrittenEntryPos)) { + return false; + } + } + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); + } + return true; +} + +// Finding useless bigram entries and remove them. Bigram entry is useless when the target PtNode +// has been deleted or is not a valid terminal. +bool DynamicBigramListPolicy::updateAllBigramEntriesAndDeleteUselessEntries( + int *const bigramListPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = *bigramListPos; + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + bigramListPos); + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + if (originalBigramPos == NOT_A_DICT_POS) { + // This entry has already been removed. + continue; + } + if (usesAdditionalBuffer) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramTargetNodePos = + followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(bigramTargetNodePos); + // TODO: Update probability for supporting probability decaying. + if (nodeReader.isDeleted() || !nodeReader.isTerminal() + || bigramTargetNodePos == NOT_A_DICT_POS) { + // The target is no longer valid terminal. Invalidate the current bigram entry. + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos)) { + return false; + } + } + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return true; +} + +// Updates bigram target PtNode positions in the list after the placing step in GC. +bool DynamicBigramListPolicy::updateAllBigramTargetPtNodePositions(int *const bigramListPos, + const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const + ptNodePositionRelocationMap) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = *bigramListPos; + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + int bigramTargetPtNodePos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &bigramTargetPtNodePos, + bigramListPos); + if (bigramTargetPtNodePos == NOT_A_DICT_POS) { + continue; + } + if (usesAdditionalBuffer) { + bigramTargetPtNodePos += mBuffer->getOriginalBufferSize(); + } + + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = + ptNodePositionRelocationMap->find(bigramTargetPtNodePos); + if (it != ptNodePositionRelocationMap->end()) { + bigramTargetPtNodePos = it->second; + } else { + bigramTargetPtNodePos = NOT_A_DICT_POS; + } + if (!BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + bigramTargetPtNodePos, &bigramEntryPos)) { + return false; + } + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return true; +} + +bool DynamicBigramListPolicy::addNewBigramEntryToBigramList(const int bigramTargetPos, + const int probability, int *const bigramListPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*bigramListPos); + if (usesAdditionalBuffer) { + *bigramListPos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int entryPos = *bigramListPos; + if (usesAdditionalBuffer) { + entryPos += mBuffer->getOriginalBufferSize(); + } + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, + bigramListPos); + if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + if (followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos) == bigramTargetPos) { + // Update this bigram entry. + const BigramListReadWriteUtils::BigramFlags updatedFlags = + BigramListReadWriteUtils::setProbabilityInFlags(bigramFlags, probability); + return BigramListReadWriteUtils::writeBigramEntry(mBuffer, updatedFlags, + originalBigramPos, &entryPos); + } + if (BigramListReadWriteUtils::hasNext(bigramFlags)) { + continue; + } + // The current last entry is found. + // First, update the flags of the last entry. + if (!BigramListReadWriteUtils::setHasNextFlag(mBuffer, true /* hasNext */, entryPos)) { + return false; + } + if (usesAdditionalBuffer) { + *bigramListPos += mBuffer->getOriginalBufferSize(); + } + // Then, add a new entry after the last entry. + return writeNewBigramEntry(bigramTargetPos, probability, bigramListPos); + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + // We return directly from the while loop. + ASSERT(false); + return false; +} + +bool DynamicBigramListPolicy::writeNewBigramEntry(const int bigramTargetPos, const int probability, + int *const writingPos) { + // hasNext is false because we are adding a new bigram entry at the end of the bigram list. + return BigramListReadWriteUtils::createAndWriteBigramEntry(mBuffer, bigramTargetPos, + probability, false /* hasNext */, writingPos); +} + +bool DynamicBigramListPolicy::removeBigram(const int bigramListPos, const int bigramTargetPos) { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(bigramListPos); + int pos = bigramListPos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + BigramListReadWriteUtils::BigramFlags bigramFlags; + int bigramEntryCount = 0; + do { + if (++bigramEntryCount > BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT) { + AKLOGE("Too many bigram entries. Entry count: %d, Limit: %d", + bigramEntryCount, BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT); + ASSERT(false); + return false; + } + int bigramEntryPos = pos; + int originalBigramPos; + // The buffer address can be changed after calling buffer writing methods. + BigramListReadWriteUtils::getBigramEntryPropertiesAndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), &bigramFlags, &originalBigramPos, &pos); + if (usesAdditionalBuffer) { + bigramEntryPos += mBuffer->getOriginalBufferSize(); + } + if (usesAdditionalBuffer && originalBigramPos != NOT_A_DICT_POS) { + originalBigramPos += mBuffer->getOriginalBufferSize(); + } + const int bigramPos = followBigramLinkAndGetCurrentBigramPtNodePos(originalBigramPos); + if (bigramPos != bigramTargetPos) { + continue; + } + // Target entry is found. Write an invalid target position to mark the bigram invalid. + return BigramListReadWriteUtils::writeBigramEntry(mBuffer, bigramFlags, + NOT_A_DICT_POS /* targetOffset */, &bigramEntryPos); + } while(BigramListReadWriteUtils::hasNext(bigramFlags)); + return false; +} + +int DynamicBigramListPolicy::followBigramLinkAndGetCurrentBigramPtNodePos( + const int originalBigramPos) const { + if (originalBigramPos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int currentPos = originalBigramPos; + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, this /* bigramsPolicy */, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); + int bigramLinkCount = 0; + while (nodeReader.getBigramLinkedNodePos() != NOT_A_DICT_POS) { + currentPos = nodeReader.getBigramLinkedNodePos(); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(currentPos); + bigramLinkCount++; + if (bigramLinkCount > CONTINUING_BIGRAM_LINK_COUNT_LIMIT) { + AKLOGE("Bigram link is invalid. start position: %d", originalBigramPos); + ASSERT(false); + return NOT_A_DICT_POS; + } + } + return currentPos; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h new file mode 100644 index 000000000..8ea318a41 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H +#define LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DictionaryShortcutsStructurePolicy; + +/* + * This is a dynamic version of BigramListPolicy and supports an additional buffer. + */ +class DynamicBigramListPolicy : public DictionaryBigramsStructurePolicy { + public: + DynamicBigramListPolicy(BufferWithExtendableBuffer *const buffer, + const DictionaryShortcutsStructurePolicy *const shortcutPolicy) + : mBuffer(buffer), mShortcutPolicy(shortcutPolicy) {} + + ~DynamicBigramListPolicy() {} + + void getNextBigram(int *const outBigramPos, int *const outProbability, bool *const outHasNext, + int *const bigramEntryPos) const; + + void skipAllBigrams(int *const bigramListPos) const; + + // Copy bigrams from the bigram list that starts at fromPos in mBuffer to toPos in + // bufferToWrite and advance these positions after bigram lists. This method skips invalid + // bigram entries and write the valid bigram entry count to outBigramsCount. + bool copyAllBigrams(BufferWithExtendableBuffer *const bufferToWrite, int *const fromPos, + int *const toPos, int *const outBigramsCount) const; + + bool updateAllBigramEntriesAndDeleteUselessEntries(int *const bigramListPos); + + bool updateAllBigramTargetPtNodePositions(int *const bigramListPos, + const DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap *const + ptNodePositionRelocationMap); + + bool addNewBigramEntryToBigramList(const int bigramTargetPos, const int probability, + int *const bigramListPos); + + bool writeNewBigramEntry(const int bigramTargetPos, const int probability, + int *const writingPos); + + // Return if targetBigramPos is found or not. + bool removeBigram(const int bigramListPos, const int bigramTargetPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicBigramListPolicy); + + static const int CONTINUING_BIGRAM_LINK_COUNT_LIMIT; + static const int BIGRAM_ENTRY_COUNT_IN_A_BIGRAM_LIST_LIMIT; + + BufferWithExtendableBuffer *const mBuffer; + const DictionaryShortcutsStructurePolicy *const mShortcutPolicy; + + // Follow bigram link and return the position of bigram target PtNode that is currently valid. + int followBigramLinkAndGetCurrentBigramPtNodePos(const int originalBigramPos) const; +}; +} // namespace latinime +#endif // LATINIME_DYNAMIC_BIGRAM_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp new file mode 100644 index 000000000..ff80dd2f6 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h" + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +/* static */ DictionaryStructureWithBufferPolicy *DictionaryStructureWithBufferPolicyFactory + ::newDictionaryStructureWithBufferPolicy(const char *const path, const int bufOffset, + const int size, const bool isUpdatable) { + // Allocated buffer in MmapedBuffer::openBuffer() will be freed in the destructor of + // impl classes of DictionaryStructureWithBufferPolicy. + const MmappedBuffer *const mmapedBuffer = MmappedBuffer::openBuffer(path, bufOffset, size, + isUpdatable); + if (!mmapedBuffer) { + return 0; + } + switch (FormatUtils::detectFormatVersion(mmapedBuffer->getBuffer(), + mmapedBuffer->getBufferSize())) { + case FormatUtils::VERSION_2: + return new PatriciaTriePolicy(mmapedBuffer); + case FormatUtils::VERSION_3: + return new DynamicPatriciaTriePolicy(mmapedBuffer); + default: + AKLOGE("DICT: dictionary format is unknown, bad magic number"); + delete mmapedBuffer; + ASSERT(false); + return 0; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h new file mode 100644 index 000000000..8cebc3b16 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dictionary_structure_with_buffer_policy_factory.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H +#define LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" + +namespace latinime { + +class DictionaryStructureWithBufferPolicyFactory { + public: + static DictionaryStructureWithBufferPolicy *newDictionaryStructureWithBufferPolicy( + const char *const path, const int bufOffset, const int size, const bool isUpdatable); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictionaryStructureWithBufferPolicyFactory); +}; +} // namespace latinime +#endif // LATINIME_DICTIONARY_STRUCTURE_WITH_BUFFER_POLICY_FACTORY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp new file mode 100644 index 000000000..c60e45819 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" + +namespace latinime { + +bool DynamicPatriciaTrieGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + // PtNode is useless when the PtNode is not a terminal and doesn't have any not useless + // children. + bool isUselessPtNode = !node->isTerminal(); + if (mChildrenValue > 0) { + isUselessPtNode = false; + } else if (node->isTerminal()) { + // Remove children as all children are useless. + int writingPos = node->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( + mBuffer, NOT_A_DICT_POS /* childrenPosition */, &writingPos)) { + return false; + } + } + if (isUselessPtNode) { + // Current PtNode is no longer needed. Mark it as deleted. + if (!mWritingHelper->markNodeAsDeleted(node)) { + return false; + } + } else { + valueStack.back() += 1; + } + return true; +} + +// Writes dummy PtNode array size when the head of PtNode array is read. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onDescend(const int ptNodeArrayPos) { + mValidPtNodeCount = 0; + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::value_type( + ptNodeArrayPos, writingPos)); + // Writes dummy PtNode array size because arrays can have a forward link or needles PtNodes. + // This field will be updated later in onReadingPtNodeArrayTail() with actual PtNode count. + mPtNodeArraySizeFieldPos = writingPos; + return DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, 0 /* arraySize */, &writingPos); +} + +// Write PtNode array terminal and actual PtNode array size. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onReadingPtNodeArrayTail() { + int writingPos = mBufferToWrite->getTailPosition(); + // Write PtNode array terminal. + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( + mBufferToWrite, NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Write actual PtNode array size. + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + mBufferToWrite, mValidPtNodeCount, &mPtNodeArraySizeFieldPos)) { + return false; + } + return true; +} + +// Write valid PtNode to buffer and memorize mapping from the old position to the new position. +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + if (node->isDeleted()) { + // Current PtNode is not written in new buffer because it has been deleted. + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( + node->getHeadPos(), NOT_A_DICT_POS)); + return true; + } + int writingPos = mBufferToWrite->getTailPosition(); + mDictPositionRelocationMap->mPtNodePositionRelocationMap.insert( + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::value_type( + node->getHeadPos(), writingPos)); + mValidPtNodeCount++; + // Writes current PtNode. + return mWritingHelper->writePtNodeToBufferByCopyingPtNodeInfo(mBufferToWrite, node, + node->getParentPos(), nodeCodePoints, node->getCodePointCount(), + node->getProbability(), &writingPos); +} + +bool DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields + ::onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + // Updates parent position. + int parentPos = node->getParentPos(); + if (parentPos != NOT_A_DICT_POS) { + DynamicPatriciaTrieWritingHelper::PtNodePositionRelocationMap::const_iterator it = + mDictPositionRelocationMap->mPtNodePositionRelocationMap.find(parentPos); + if (it != mDictPositionRelocationMap->mPtNodePositionRelocationMap.end()) { + parentPos = it->second; + } + } + int writingPos = node->getHeadPos() + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; + // Write updated parent offset. + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(mBufferToWrite, + parentPos, node->getHeadPos(), &writingPos)) { + return false; + } + + // Updates children position. + int childrenPos = node->getChildrenPos(); + if (childrenPos != NOT_A_DICT_POS) { + DynamicPatriciaTrieWritingHelper::PtNodeArrayPositionRelocationMap::const_iterator it = + mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.find(childrenPos); + if (it != mDictPositionRelocationMap->mPtNodeArrayPositionRelocationMap.end()) { + childrenPos = it->second; + } + } + writingPos = node->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBufferToWrite, + childrenPos, &writingPos)) { + return false; + } + + // Updates bigram target PtNode positions in the bigram list. + int bigramsPos = node->getBigramsPos(); + if (bigramsPos != NOT_A_DICT_POS) { + if (!mBigramPolicy->updateAllBigramTargetPtNodePositions(&bigramsPos, + &mDictPositionRelocationMap->mPtNodePositionRelocationMap)) { + return false; + } + } + + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h new file mode 100644 index 000000000..4256f22fb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h @@ -0,0 +1,178 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H + +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class DynamicPatriciaTrieGcEventListeners { + public: + // Updates all PtNodes that can be reached from the root. Checks if each PtNode is useless or + // not and marks useless PtNodes as deleted. Such deleted PtNodes will be discarded in the GC. + // TODO: Concatenate non-terminal PtNodes. + class TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + DynamicPatriciaTrieWritingHelper *const writingHelper, + BufferWithExtendableBuffer *const buffer) + : mWritingHelper(writingHelper), mBuffer(buffer), valueStack(), + mChildrenValue(0) {} + + ~TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted() {}; + + bool onAscend() { + if (valueStack.empty()) { + return false; + } + mChildrenValue = valueStack.back(); + valueStack.pop_back(); + return true; + } + + bool onDescend(const int ptNodeArrayPos) { + valueStack.push_back(0); + return true; + } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS( + TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted); + + DynamicPatriciaTrieWritingHelper *const mWritingHelper; + BufferWithExtendableBuffer *const mBuffer; + std::vector<int> valueStack; + int mChildrenValue; + }; + + // Updates all bigram entries that are held by valid PtNodes. This removes useless bigram + // entries. + class TraversePolicyToUpdateBigramProbability + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateBigramProbability(DynamicBigramListPolicy *const bigramPolicy) + : mBigramPolicy(bigramPolicy) {} + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) { + if (!node->isDeleted()) { + int pos = node->getBigramsPos(); + if (pos != NOT_A_DICT_POS) { + if (!mBigramPolicy->updateAllBigramEntriesAndDeleteUselessEntries(&pos)) { + return false; + } + } + } + return true; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateBigramProbability); + + DynamicBigramListPolicy *const mBigramPolicy; + }; + + class TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToPlaceAndWriteValidPtNodesToBuffer( + DynamicPatriciaTrieWritingHelper *const writingHelper, + BufferWithExtendableBuffer *const bufferToWrite, + DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + dictPositionRelocationMap) + : mWritingHelper(writingHelper), mBufferToWrite(bufferToWrite), + mDictPositionRelocationMap(dictPositionRelocationMap), mValidPtNodeCount(0), + mPtNodeArraySizeFieldPos(NOT_A_DICT_POS) {}; + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos); + + bool onReadingPtNodeArrayTail(); + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToPlaceAndWriteValidPtNodesToBuffer); + + DynamicPatriciaTrieWritingHelper *const mWritingHelper; + BufferWithExtendableBuffer *const mBufferToWrite; + DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + mDictPositionRelocationMap; + int mValidPtNodeCount; + int mPtNodeArraySizeFieldPos; + }; + + class TraversePolicyToUpdateAllPositionFields + : public DynamicPatriciaTrieReadingHelper::TraversingEventListener { + public: + TraversePolicyToUpdateAllPositionFields( + DynamicPatriciaTrieWritingHelper *const writingHelper, + DynamicBigramListPolicy *const bigramPolicy, + BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + dictPositionRelocationMap) + : mWritingHelper(writingHelper), mBigramPolicy(bigramPolicy), + mBufferToWrite(bufferToWrite), + mDictPositionRelocationMap(dictPositionRelocationMap) {}; + + bool onAscend() { return true; } + + bool onDescend(const int ptNodeArrayPos) { return true; } + + bool onReadingPtNodeArrayTail() { return true; } + + bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(TraversePolicyToUpdateAllPositionFields); + + DynamicPatriciaTrieWritingHelper *const mWritingHelper; + DynamicBigramListPolicy *const mBigramPolicy; + BufferWithExtendableBuffer *const mBufferToWrite; + const DynamicPatriciaTrieWritingHelper::DictPositionRelocationMap *const + mDictPositionRelocationMap; + }; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieGcEventListeners); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_GC_EVENT_LISTENERS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp new file mode 100644 index 000000000..456352c17 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" + +#include "suggest/core/policy/dictionary_bigrams_structure_policy.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +void DynamicPatriciaTrieNodeReader::fetchPtNodeInfoFromBufferAndProcessMovedPtNode( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) { + if (ptNodePos < 0 || ptNodePos >= mBuffer->getTailPosition()) { + AKLOGE("Fetching PtNode info form invalid dictionary position: %d, dictionary size: %d", + ptNodePos, mBuffer->getTailPosition()); + ASSERT(false); + invalidatePtNodeInfo(); + return; + } + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(ptNodePos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + int pos = ptNodePos; + mHeadPos = ptNodePos; + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + mFlags = PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const int parentPosOffset = + DynamicPatriciaTrieReadingUtils::getParentPtNodePosOffsetAndAdvancePosition(dictBuf, + &pos); + mParentPos = DynamicPatriciaTrieReadingUtils::getParentPtNodePos(parentPosOffset, mHeadPos); + if (outCodePoints != 0) { + mCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + dictBuf, mFlags, maxCodePointCount, outCodePoints, &pos); + } else { + mCodePointCount = PatriciaTrieReadingUtils::skipCharacters( + dictBuf, mFlags, MAX_WORD_LENGTH, &pos); + } + if (isTerminal()) { + mProbabilityFieldPos = pos; + if (usesAdditionalBuffer) { + mProbabilityFieldPos += mBuffer->getOriginalBufferSize(); + } + mProbability = PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(dictBuf, &pos); + } else { + mProbabilityFieldPos = NOT_A_DICT_POS; + mProbability = NOT_A_PROBABILITY; + } + mChildrenPosFieldPos = pos; + if (usesAdditionalBuffer) { + mChildrenPosFieldPos += mBuffer->getOriginalBufferSize(); + } + mChildrenPos = DynamicPatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + dictBuf, &pos); + if (usesAdditionalBuffer && mChildrenPos != NOT_A_DICT_POS) { + mChildrenPos += mBuffer->getOriginalBufferSize(); + } + if (mSiblingPos == NOT_A_DICT_POS) { + if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) { + mBigramLinkedNodePos = mChildrenPos; + } else { + mBigramLinkedNodePos = NOT_A_DICT_POS; + } + } + if (usesAdditionalBuffer) { + pos += mBuffer->getOriginalBufferSize(); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(mFlags)) { + mShortcutPos = pos; + mShortcutsPolicy->skipAllShortcuts(&pos); + } else { + mShortcutPos = NOT_A_DICT_POS; + } + if (PatriciaTrieReadingUtils::hasBigrams(mFlags)) { + mBigramPos = pos; + mBigramsPolicy->skipAllBigrams(&pos); + } else { + mBigramPos = NOT_A_DICT_POS; + } + // Update siblingPos if needed. + if (mSiblingPos == NOT_A_DICT_POS) { + // Sibling position is the tail position of current node. + mSiblingPos = pos; + } + // Read destination node if the read node is a moved node. + if (DynamicPatriciaTrieReadingUtils::isMoved(mFlags)) { + // The destination position is stored at the same place as the parent position. + fetchPtNodeInfoFromBufferAndProcessMovedPtNode(mParentPos, maxCodePointCount, + outCodePoints); + } +} + +void DynamicPatriciaTrieNodeReader::invalidatePtNodeInfo() { + mHeadPos = NOT_A_DICT_POS; + mFlags = 0; + mParentPos = NOT_A_DICT_POS; + mCodePointCount = 0; + mProbabilityFieldPos = NOT_A_DICT_POS; + mProbability = NOT_A_PROBABILITY; + mChildrenPosFieldPos = NOT_A_DICT_POS; + mChildrenPos = NOT_A_DICT_POS; + mBigramLinkedNodePos = NOT_A_DICT_POS; + mShortcutPos = NOT_A_DICT_POS; + mBigramPos = NOT_A_DICT_POS; + mSiblingPos = NOT_A_DICT_POS; +} + +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h new file mode 100644 index 000000000..3b36d425f --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This class is used for helping to read nodes of dynamic patricia trie. This class handles moved + * node and reads node attributes. + */ +class DynamicPatriciaTrieNodeReader { + public: + DynamicPatriciaTrieNodeReader(const BufferWithExtendableBuffer *const buffer, + const DictionaryBigramsStructurePolicy *const bigramsPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) + : mBuffer(buffer), mBigramsPolicy(bigramsPolicy), + mShortcutsPolicy(shortcutsPolicy), mHeadPos(NOT_A_DICT_POS), mFlags(0), + mParentPos(NOT_A_DICT_POS), mCodePointCount(0), mProbabilityFieldPos(NOT_A_DICT_POS), + mProbability(NOT_A_PROBABILITY), mChildrenPosFieldPos(NOT_A_DICT_POS), + mChildrenPos(NOT_A_DICT_POS), mBigramLinkedNodePos(NOT_A_DICT_POS), + mShortcutPos(NOT_A_DICT_POS), mBigramPos(NOT_A_DICT_POS), + mSiblingPos(NOT_A_DICT_POS) {} + + ~DynamicPatriciaTrieNodeReader() {} + + // Reads PtNode information from dictionary buffer and updates members with the information. + AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePos(const int ptNodePos) { + fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(ptNodePos , + 0 /* maxCodePointCount */, 0 /* outCodePoints */); + } + + AK_FORCE_INLINE void fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints) { + mSiblingPos = NOT_A_DICT_POS; + mBigramLinkedNodePos = NOT_A_DICT_POS; + fetchPtNodeInfoFromBufferAndProcessMovedPtNode(ptNodePos, maxCodePointCount, outCodePoints); + } + + // HeadPos is different from NodePos when the current PtNode is a moved PtNode. + AK_FORCE_INLINE int getHeadPos() const { + return mHeadPos; + } + + // Flags + AK_FORCE_INLINE bool isDeleted() const { + return DynamicPatriciaTrieReadingUtils::isDeleted(mFlags); + } + + AK_FORCE_INLINE bool hasChildren() const { + return mChildrenPos != NOT_A_DICT_POS; + } + + AK_FORCE_INLINE bool isTerminal() const { + return PatriciaTrieReadingUtils::isTerminal(mFlags); + } + + AK_FORCE_INLINE bool isBlacklisted() const { + return PatriciaTrieReadingUtils::isBlacklisted(mFlags); + } + + AK_FORCE_INLINE bool isNotAWord() const { + return PatriciaTrieReadingUtils::isNotAWord(mFlags); + } + + // Parent node position + AK_FORCE_INLINE int getParentPos() const { + return mParentPos; + } + + // Number of code points + AK_FORCE_INLINE uint8_t getCodePointCount() const { + return mCodePointCount; + } + + // Probability + AK_FORCE_INLINE int getProbabilityFieldPos() const { + return mProbabilityFieldPos; + } + + AK_FORCE_INLINE int getProbability() const { + return mProbability; + } + + // Children PtNode array position + AK_FORCE_INLINE int getChildrenPosFieldPos() const { + return mChildrenPosFieldPos; + } + + AK_FORCE_INLINE int getChildrenPos() const { + return mChildrenPos; + } + + // Bigram linked node position. + AK_FORCE_INLINE int getBigramLinkedNodePos() const { + return mBigramLinkedNodePos; + } + + // Shortcutlist position + AK_FORCE_INLINE int getShortcutPos() const { + return mShortcutPos; + } + + // Bigrams position + AK_FORCE_INLINE int getBigramsPos() const { + return mBigramPos; + } + + // Sibling node position + AK_FORCE_INLINE int getSiblingNodePos() const { + return mSiblingPos; + } + + private: + DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieNodeReader); + + const BufferWithExtendableBuffer *const mBuffer; + const DictionaryBigramsStructurePolicy *const mBigramsPolicy; + const DictionaryShortcutsStructurePolicy *const mShortcutsPolicy; + int mHeadPos; + DynamicPatriciaTrieReadingUtils::NodeFlags mFlags; + int mParentPos; + uint8_t mCodePointCount; + int mProbabilityFieldPos; + int mProbability; + int mChildrenPosFieldPos; + int mChildrenPos; + int mBigramLinkedNodePos; + int mShortcutPos; + int mBigramPos; + int mSiblingPos; + + void fetchPtNodeInfoFromBufferAndProcessMovedPtNode(const int ptNodePos, + const int maxCodePointCount, int *const outCodePoints); + + void invalidatePtNodeInfo(); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_NODE_READER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp new file mode 100644 index 000000000..42397c19e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.cpp @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +void DynamicPatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithPtNodeArrayPos(dicNode->getChildrenPos()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + childDicNodes->pushLeavingChild(dicNode, nodeReader->getHeadPos(), + nodeReader->getChildrenPos(), nodeReader->getProbability(), + nodeReader->isTerminal() && !nodeReader->isDeleted(), + nodeReader->hasChildren(), nodeReader->isBlacklisted() || nodeReader->isNotAWord(), + nodeReader->getCodePointCount(), readingHelper.getMergedNodeCodePoints()); + readingHelper.readNextSiblingNode(); + } +} + +int DynamicPatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + // This method traverses parent nodes from the terminal by following parent pointers; thus, + // node code points are stored in the buffer in the reverse order. + int reverseCodePoints[maxCodePointCount]; + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + // First, read the terminal node and get its probability. + readingHelper.initWithPtNodePos(ptNodePos); + if (!readingHelper.isValidTerminalNode()) { + // Node at the ptNodePos is not a valid terminal node. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Store terminal node probability. + *outUnigramProbability = readingHelper.getNodeReader()->getProbability(); + // Then, following parent node link to the dictionary root and fetch node code points. + while (!readingHelper.isEnd()) { + if (readingHelper.getTotalCodePointCount() > maxCodePointCount) { + // The ptNodePos is not a valid terminal node position in the dictionary. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Store node code points to buffer in the reverse order. + readingHelper.fetchMergedNodeCodePointsInReverseOrder( + readingHelper.getPrevTotalCodePointCount(), reverseCodePoints); + // Follow parent node toward the root node. + readingHelper.readParentNode(); + } + if (readingHelper.isError()) { + // The node position or the dictionary is invalid. + *outUnigramProbability = NOT_A_PROBABILITY; + return 0; + } + // Reverse the stored code points to output them. + const int codePointCount = readingHelper.getTotalCodePointCount(); + for (int i = 0; i < codePointCount; ++i) { + outCodePoints[i] = reverseCodePoints[codePointCount - i - 1]; + } + return codePointCount; +} + +int DynamicPatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + int searchCodePoints[length]; + for (int i = 0; i < length; ++i) { + searchCodePoints[i] = forceLowerCaseSearch ? CharUtils::toLowerCase(inWord[i]) : inWord[i]; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + while (!readingHelper.isEnd()) { + const int matchedCodePointCount = readingHelper.getPrevTotalCodePointCount(); + if (readingHelper.getTotalCodePointCount() > length + || !readingHelper.isMatchedCodePoint(0 /* index */, + searchCodePoints[matchedCodePointCount])) { + // Current node has too many code points or its first code point is different from + // target code point. Skip this node and read the next sibling node. + readingHelper.readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + if (!readingHelper.isMatchedCodePoint( + j, searchCodePoints[matchedCodePointCount + j])) { + // Different code point is found. The given word is not included in the dictionary. + return NOT_A_DICT_POS; + } + } + // All characters are matched. + if (length == readingHelper.getTotalCodePointCount()) { + // Terminal position is found. + return nodeReader->getHeadPos(); + } + if (!nodeReader->hasChildren()) { + return NOT_A_DICT_POS; + } + // Advance to the children nodes. + readingHelper.readChildNode(); + } + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + return NOT_A_DICT_POS; +} + +int DynamicPatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + // TODO: check mHeaderPolicy.usesForgettingCurve(); + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } +} + +int DynamicPatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_PROBABILITY; + } + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (nodeReader.isDeleted() || nodeReader.isBlacklisted() || nodeReader.isNotAWord()) { + return NOT_A_PROBABILITY; + } + return getProbability(nodeReader.getProbability(), NOT_A_PROBABILITY); +} + +int DynamicPatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (nodeReader.isDeleted()) { + return NOT_A_DICT_POS; + } + return nodeReader.getShortcutPos(); +} + +int DynamicPatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + DynamicPatriciaTrieNodeReader nodeReader(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + if (nodeReader.isDeleted()) { + return NOT_A_DICT_POS; + } + return nodeReader.getBigramsPos(); +} + +bool DynamicPatriciaTriePolicy::addUnigramWord(const int *const word, const int length, + const int probability) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + DynamicPatriciaTrieReadingHelper readingHelper(&mBufferWithExtendableBuffer, + getBigramsStructurePolicy(), getShortcutsStructurePolicy()); + readingHelper.initWithPtNodeArrayPos(getRootPosition()); + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.addUnigramWord(&readingHelper, word, length, probability); +} + +bool DynamicPatriciaTriePolicy::addBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1, const int probability) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_DICT_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.addBigramWords(word0Pos, word1Pos, probability); +} + +bool DynamicPatriciaTriePolicy::removeBigramWords(const int *const word0, const int length0, + const int *const word1, const int length1) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + return false; + } + const int word0Pos = getTerminalNodePositionOfWord(word0, length0, + false /* forceLowerCaseSearch */); + if (word0Pos == NOT_A_DICT_POS) { + return false; + } + const int word1Pos = getTerminalNodePositionOfWord(word1, length1, + false /* forceLowerCaseSearch */); + if (word1Pos == NOT_A_DICT_POS) { + return false; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + return writingHelper.removeBigramWords(word0Pos, word1Pos); +} + +void DynamicPatriciaTriePolicy::flush(const char *const filePath) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: flush() is called for non-updatable dictionary."); + return; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + writingHelper.writeToDictFile(filePath, &mHeaderPolicy); +} + +void DynamicPatriciaTriePolicy::flushWithGC(const char *const filePath) { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + return; + } + DynamicPatriciaTrieWritingHelper writingHelper(&mBufferWithExtendableBuffer, + &mBigramListPolicy, &mShortcutListPolicy); + writingHelper.writeToDictFileWithGC(getRootPosition(), filePath, &mHeaderPolicy); +} + +bool DynamicPatriciaTriePolicy::needsToRunGC() const { + if (!mBuffer->isUpdatable()) { + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + // TODO: Implement more properly. + return mBufferWithExtendableBuffer.isNearSizeLimit(); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h new file mode 100644 index 000000000..06d8095d8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_policy.h @@ -0,0 +1,104 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +class DynamicPatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + DynamicPatriciaTriePolicy(const MmappedBuffer *const buffer) + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), + mBufferWithExtendableBuffer(mBuffer->getBuffer() + mHeaderPolicy.getSize(), + mBuffer->getBufferSize() - mHeaderPolicy.getSize()), + mShortcutListPolicy(&mBufferWithExtendableBuffer), + mBigramListPolicy(&mBufferWithExtendableBuffer, &mShortcutListPolicy) {} + + ~DynamicPatriciaTriePolicy() { + delete mBuffer; + } + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndProbabilityAndReturnCodePointCount( + const int terminalPtNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + + int getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; + + int getShortcutPositionOfPtNode(const int ptNodePos) const; + + int getBigramsPositionOfPtNode(const int ptNodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramListPolicy; + } + + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutListPolicy; + } + + bool addUnigramWord(const int *const word, const int length, const int probability); + + bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability); + + bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1); + + void flush(const char *const filePath); + + void flushWithGC(const char *const filePath); + + bool needsToRunGC() const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTriePolicy); + + const MmappedBuffer *const mBuffer; + const HeaderPolicy mHeaderPolicy; + BufferWithExtendableBuffer mBufferWithExtendableBuffer; + DynamicShortcutListPolicy mShortcutListPolicy; + DynamicBigramListPolicy mBigramListPolicy; +}; +} // namespace latinime +#endif // LATINIME_DYNAMIC_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp new file mode 100644 index 000000000..f4a2ef389 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// To avoid infinite loop caused by invalid or malicious forward links. +const int DynamicPatriciaTrieReadingHelper::MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const int DynamicPatriciaTrieReadingHelper::MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP = 100000; +const size_t DynamicPatriciaTrieReadingHelper::MAX_READING_STATE_STACK_SIZE = MAX_WORD_LENGTH; + +// Visits all PtNodes in post-order depth first manner. +// For example, visits c -> b -> y -> x -> a for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPostorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + while (!isEnd()) { + if (!alreadyVisitedChildren) { + if (mNodeReader.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(mNodeReader.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(); + } else { + alreadyVisitedChildren = true; + } + } else { + if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) { + return false; + } + readNextSiblingNode(); + if (isEnd()) { + // All PtNodes in current linked PtNode arrays have been visited. + // Return to the parent. + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + if (!listener->onAscend()) { + return false; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + } else { + // Process sibling PtNode. + alreadyVisitedChildren = false; + } + } + } + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +// Visits all PtNodes in PtNode array level pre-order depth first manner, which is the same order +// that PtNodes are written in the dictionary buffer. +// For example, visits a -> b -> x -> c -> y for the following dictionary: +// a _ b _ c +// \ x _ y +bool DynamicPatriciaTrieReadingHelper::traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener) { + bool alreadyVisitedAllPtNodesInArray = false; + bool alreadyVisitedChildren = false; + // Descend from the root to the root PtNode array. + if (!listener->onDescend(getPosOfLastPtNodeArrayHead())) { + return false; + } + pushReadingStateToStack(); + while (!isEnd()) { + if (alreadyVisitedAllPtNodesInArray) { + if (alreadyVisitedChildren) { + // Move to next sibling PtNode's children. + readNextSiblingNode(); + if (isEnd()) { + // Return to the parent PTNode. + if (!listener->onAscend()) { + return false; + } + if (mReadingStateStack.size() <= 0) { + break; + } + popReadingStateFromStack(); + alreadyVisitedChildren = true; + alreadyVisitedAllPtNodesInArray = true; + } else { + alreadyVisitedChildren = false; + } + } else { + if (mNodeReader.hasChildren()) { + // Move to the first child. + if (!listener->onDescend(mNodeReader.getChildrenPos())) { + return false; + } + pushReadingStateToStack(); + readChildNode(); + // Push state to return the head of PtNode array. + pushReadingStateToStack(); + alreadyVisitedAllPtNodesInArray = false; + alreadyVisitedChildren = false; + } else { + alreadyVisitedChildren = true; + } + } + } else { + if (!listener->onVisitingPtNode(&mNodeReader, mMergedNodeCodePoints)) { + return false; + } + readNextSiblingNode(); + if (isEnd()) { + if (!listener->onReadingPtNodeArrayTail()) { + return false; + } + // Return to the head of current PtNode array. + popReadingStateFromStack(); + alreadyVisitedAllPtNodesInArray = true; + } + } + } + popReadingStateFromStack(); + // Ascend from the root PtNode array to the root. + if (!listener->onAscend()) { + return false; + } + return !isError(); +} + +// Read node array size and process empty node arrays. Nodes and arrays are counted up in this +// method to avoid an infinite loop. +void DynamicPatriciaTrieReadingHelper::nextPtNodeArray() { + mReadingState.mPosOfLastPtNodeArrayHead = mReadingState.mPos; + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + mReadingState.mPos -= mBuffer->getOriginalBufferSize(); + } + mReadingState.mNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + dictBuf, &mReadingState.mPos); + if (usesAdditionalBuffer) { + mReadingState.mPos += mBuffer->getOriginalBufferSize(); + } + // Count up nodes and node arrays to avoid infinite loop. + mReadingState.mTotalNodeCount += mReadingState.mNodeCount; + mReadingState.mNodeArrayCount++; + if (mReadingState.mNodeCount < 0 + || mReadingState.mTotalNodeCount > MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP + || mReadingState.mNodeArrayCount > MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP) { + // Invalid dictionary. + AKLOGI("Invalid dictionary. nodeCount: %d, totalNodeCount: %d, MAX_CHILD_COUNT: %d" + "nodeArrayCount: %d, MAX_NODE_ARRAY_COUNT: %d", + mReadingState.mNodeCount, mReadingState.mTotalNodeCount, + MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP, mReadingState.mNodeArrayCount, + MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + return; + } + if (mReadingState.mNodeCount == 0) { + // Empty node array. Try following forward link. + followForwardLink(); + } +} + +// Follow the forward link and read the next node array if exists. +void DynamicPatriciaTrieReadingHelper::followForwardLink() { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(mReadingState.mPos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + mReadingState.mPos -= mBuffer->getOriginalBufferSize(); + } + const int forwardLinkPosition = + DynamicPatriciaTrieReadingUtils::getForwardLinkPosition(dictBuf, mReadingState.mPos); + if (usesAdditionalBuffer) { + mReadingState.mPos += mBuffer->getOriginalBufferSize(); + } + mReadingState.mPosOfLastForwardLinkField = mReadingState.mPos; + if (DynamicPatriciaTrieReadingUtils::isValidForwardLinkPosition(forwardLinkPosition)) { + // Follow the forward link. + mReadingState.mPos += forwardLinkPosition; + nextPtNodeArray(); + } else { + // All node arrays have been read. + mReadingState.mPos = NOT_A_DICT_POS; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h new file mode 100644 index 000000000..c6d8ddcf7 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h @@ -0,0 +1,286 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H + +#include <cstddef> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DictionaryBigramsStructurePolicy; +class DictionaryShortcutsStructurePolicy; + +/* + * This class is used for traversing dynamic patricia trie. This class supports iterating nodes and + * dealing with additional buffer. This class counts nodes and node arrays to avoid infinite loop. + */ +class DynamicPatriciaTrieReadingHelper { + public: + class TraversingEventListener { + public: + virtual ~TraversingEventListener() {}; + + // Returns whether the event handling was succeeded or not. + virtual bool onAscend() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onDescend(const int ptNodeArrayPos) = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onReadingPtNodeArrayTail() = 0; + + // Returns whether the event handling was succeeded or not. + virtual bool onVisitingPtNode(const DynamicPatriciaTrieNodeReader *const node, + const int *const nodeCodePoints) = 0; + + protected: + TraversingEventListener() {}; + + private: + DISALLOW_COPY_AND_ASSIGN(TraversingEventListener); + }; + + DynamicPatriciaTrieReadingHelper(const BufferWithExtendableBuffer *const buffer, + const DictionaryBigramsStructurePolicy *const bigramsPolicy, + const DictionaryShortcutsStructurePolicy *const shortcutsPolicy) + : mIsError(false), mReadingState(), mBuffer(buffer), + mNodeReader(mBuffer, bigramsPolicy, shortcutsPolicy), mReadingStateStack() {} + + ~DynamicPatriciaTrieReadingHelper() {} + + AK_FORCE_INLINE bool isError() const { + return mIsError; + } + + AK_FORCE_INLINE bool isEnd() const { + return mReadingState.mPos == NOT_A_DICT_POS; + } + + // Initialize reading state with the head position of a PtNode array. + AK_FORCE_INLINE void initWithPtNodeArrayPos(const int ptNodeArrayPos) { + if (ptNodeArrayPos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mReadingState.mPos = ptNodeArrayPos; + mReadingState.mPrevTotalCodePointCount = 0; + mReadingState.mTotalNodeCount = 0; + mReadingState.mNodeArrayCount = 0; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingStateStack.clear(); + nextPtNodeArray(); + if (!isEnd()) { + fetchPtNodeInfo(); + } + } + } + + // Initialize reading state with the head position of a node. + AK_FORCE_INLINE void initWithPtNodePos(const int ptNodePos) { + if (ptNodePos == NOT_A_DICT_POS) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mIsError = false; + mReadingState.mPos = ptNodePos; + mReadingState.mNodeCount = 1; + mReadingState.mPrevTotalCodePointCount = 0; + mReadingState.mTotalNodeCount = 1; + mReadingState.mNodeArrayCount = 1; + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS; + mReadingStateStack.clear(); + fetchPtNodeInfo(); + } + } + + AK_FORCE_INLINE const DynamicPatriciaTrieNodeReader* getNodeReader() const { + return &mNodeReader; + } + + AK_FORCE_INLINE bool isValidTerminalNode() const { + return !isEnd() && !mNodeReader.isDeleted() && mNodeReader.isTerminal(); + } + + AK_FORCE_INLINE bool isMatchedCodePoint(const int index, const int codePoint) const { + return mMergedNodeCodePoints[index] == codePoint; + } + + // Return code point count exclude the last read node's code points. + AK_FORCE_INLINE int getPrevTotalCodePointCount() const { + return mReadingState.mPrevTotalCodePointCount; + } + + // Return code point count include the last read node's code points. + AK_FORCE_INLINE int getTotalCodePointCount() const { + return mReadingState.mPrevTotalCodePointCount + mNodeReader.getCodePointCount(); + } + + AK_FORCE_INLINE void fetchMergedNodeCodePointsInReverseOrder( + const int index, int *const outCodePoints) const { + const int nodeCodePointCount = mNodeReader.getCodePointCount(); + for (int i = 0; i < nodeCodePointCount; ++i) { + outCodePoints[index + i] = mMergedNodeCodePoints[nodeCodePointCount - 1 - i]; + } + } + + AK_FORCE_INLINE const int *getMergedNodeCodePoints() const { + return mMergedNodeCodePoints; + } + + AK_FORCE_INLINE void readNextSiblingNode() { + mReadingState.mNodeCount -= 1; + mReadingState.mPos = mNodeReader.getSiblingNodePos(); + if (mReadingState.mNodeCount <= 0) { + // All nodes in the current node array have been read. + followForwardLink(); + if (!isEnd()) { + fetchPtNodeInfo(); + } + } else { + fetchPtNodeInfo(); + } + } + + // Read the first child node of the current node. + AK_FORCE_INLINE void readChildNode() { + if (mNodeReader.hasChildren()) { + mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mReadingState.mTotalNodeCount = 0; + mReadingState.mNodeArrayCount = 0; + mReadingState.mPos = mNodeReader.getChildrenPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + // Read children node array. + nextPtNodeArray(); + if (!isEnd()) { + fetchPtNodeInfo(); + } + } else { + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + // Read the parent node of the current node. + AK_FORCE_INLINE void readParentNode() { + if (mNodeReader.getParentPos() != NOT_A_DICT_POS) { + mReadingState.mPrevTotalCodePointCount += mNodeReader.getCodePointCount(); + mReadingState.mTotalNodeCount = 1; + mReadingState.mNodeArrayCount = 1; + mReadingState.mNodeCount = 1; + mReadingState.mPos = mNodeReader.getParentPos(); + mReadingState.mPosOfLastForwardLinkField = NOT_A_DICT_POS; + mReadingState.mPosOfLastPtNodeArrayHead = NOT_A_DICT_POS; + fetchPtNodeInfo(); + } else { + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + AK_FORCE_INLINE int getPosOfLastForwardLinkField() const { + return mReadingState.mPosOfLastForwardLinkField; + } + + AK_FORCE_INLINE int getPosOfLastPtNodeArrayHead() const { + return mReadingState.mPosOfLastPtNodeArrayHead; + } + + AK_FORCE_INLINE void reloadCurrentPtNodeInfo() { + if (!isEnd()) { + fetchPtNodeInfo(); + } + } + + bool traverseAllPtNodesInPostorderDepthFirstManner(TraversingEventListener *const listener); + + bool traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + TraversingEventListener *const listener); + + private: + DISALLOW_COPY_AND_ASSIGN(DynamicPatriciaTrieReadingHelper); + + class ReadingState { + public: + // Note that copy constructor and assignment operator are used for this class to use + // std::vector. + ReadingState() : mPos(NOT_A_DICT_POS), mNodeCount(0), mPrevTotalCodePointCount(0), + mTotalNodeCount(0), mNodeArrayCount(0), mPosOfLastForwardLinkField(NOT_A_DICT_POS), + mPosOfLastPtNodeArrayHead(NOT_A_DICT_POS) {} + + int mPos; + // Node count of a node array. + int mNodeCount; + int mPrevTotalCodePointCount; + int mTotalNodeCount; + int mNodeArrayCount; + int mPosOfLastForwardLinkField; + int mPosOfLastPtNodeArrayHead; + }; + + static const int MAX_CHILD_COUNT_TO_AVOID_INFINITE_LOOP; + static const int MAX_NODE_ARRAY_COUNT_TO_AVOID_INFINITE_LOOP; + static const size_t MAX_READING_STATE_STACK_SIZE; + + bool mIsError; + ReadingState mReadingState; + const BufferWithExtendableBuffer *const mBuffer; + DynamicPatriciaTrieNodeReader mNodeReader; + int mMergedNodeCodePoints[MAX_WORD_LENGTH]; + std::vector<ReadingState> mReadingStateStack; + + void nextPtNodeArray(); + + void followForwardLink(); + + AK_FORCE_INLINE void fetchPtNodeInfo() { + mNodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(mReadingState.mPos, + MAX_WORD_LENGTH, mMergedNodeCodePoints); + if (mNodeReader.getCodePointCount() <= 0) { + // Empty node is not allowed. + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + } + } + + AK_FORCE_INLINE void pushReadingStateToStack() { + if (mReadingStateStack.size() > MAX_READING_STATE_STACK_SIZE) { + AKLOGI("Reading state stack overflow. Max size: %zd", MAX_READING_STATE_STACK_SIZE); + ASSERT(false); + mIsError = true; + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingStateStack.push_back(mReadingState); + } + } + + AK_FORCE_INLINE void popReadingStateFromStack() { + if (mReadingStateStack.empty()) { + mReadingState.mPos = NOT_A_DICT_POS; + } else { + mReadingState = mReadingStateStack.back(); + mReadingStateStack.pop_back(); + fetchPtNodeInfo(); + } + } +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..d68446db6 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +typedef DynamicPatriciaTrieReadingUtils DptReadingUtils; + +const DptReadingUtils::NodeFlags DptReadingUtils::MASK_MOVED = 0xC0; +const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_NOT_MOVED = 0xC0; +const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_MOVED = 0x40; +const DptReadingUtils::NodeFlags DptReadingUtils::FLAG_IS_DELETED = 0x80; + +// TODO: Make DICT_OFFSET_ZERO_OFFSET = 0. +// Currently, DICT_OFFSET_INVALID is 0 in Java side but offset can be 0 during GC. So, the maximum +// value of offsets, which is 0x7FFFFF is used to represent 0 offset. +const int DptReadingUtils::DICT_OFFSET_INVALID = 0; +const int DptReadingUtils::DICT_OFFSET_ZERO_OFFSET = 0x7FFFFF; + +/* static */ int DptReadingUtils::getForwardLinkPosition(const uint8_t *const buffer, + const int pos) { + int linkAddressPos = pos; + return ByteArrayUtils::readSint24AndAdvancePosition(buffer, &linkAddressPos); +} + +/* static */ int DptReadingUtils::getParentPtNodePosOffsetAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); +} + +/* static */ int DptReadingUtils::getParentPtNodePos(const int parentOffset, const int ptNodePos) { + if (parentOffset == DICT_OFFSET_INVALID) { + return NOT_A_DICT_POS; + } else if (parentOffset == DICT_OFFSET_ZERO_OFFSET) { + return ptNodePos; + } else { + return parentOffset + ptNodePos; + } +} + +/* static */ int DptReadingUtils::readChildrenPositionAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const int base = *pos; + const int offset = ByteArrayUtils::readSint24AndAdvancePosition(buffer, pos); + if (offset == DICT_OFFSET_INVALID) { + // The PtNode does not have children. + return NOT_A_DICT_POS; + } else if (offset == DICT_OFFSET_ZERO_OFFSET) { + return base; + } else { + return base + offset; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h new file mode 100644 index 000000000..67c3cc57e --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class DynamicPatriciaTrieReadingUtils { + public: + typedef uint8_t NodeFlags; + + static const int DICT_OFFSET_INVALID; + static const int DICT_OFFSET_ZERO_OFFSET; + + static int getForwardLinkPosition(const uint8_t *const buffer, const int pos); + + static AK_FORCE_INLINE bool isValidForwardLinkPosition(const int forwardLinkAddress) { + return forwardLinkAddress != 0; + } + + static int getParentPtNodePosOffsetAndAdvancePosition(const uint8_t *const buffer, + int *const pos); + + static int getParentPtNodePos(const int parentOffset, const int ptNodePos); + + static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + /** + * Node Flags + */ + static AK_FORCE_INLINE bool isMoved(const NodeFlags flags) { + return FLAG_IS_MOVED == (MASK_MOVED & flags); + } + + static AK_FORCE_INLINE bool isDeleted(const NodeFlags flags) { + return FLAG_IS_DELETED == (MASK_MOVED & flags); + } + + static AK_FORCE_INLINE NodeFlags updateAndGetFlags(const NodeFlags originalFlags, + const bool isMoved, const bool isDeleted) { + NodeFlags flags = originalFlags; + flags = isMoved ? ((flags & (~MASK_MOVED)) | FLAG_IS_MOVED) : flags; + flags = isDeleted ? ((flags & (~MASK_MOVED)) | FLAG_IS_DELETED) : flags; + flags = (!isMoved && !isDeleted) ? ((flags & (~MASK_MOVED)) | FLAG_IS_NOT_MOVED) : flags; + return flags; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieReadingUtils); + + static const NodeFlags MASK_MOVED; + static const NodeFlags FLAG_IS_NOT_MOVED; + static const NodeFlags FLAG_IS_MOVED; + static const NodeFlags FLAG_IS_DELETED; +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp new file mode 100644 index 000000000..578645cd5 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.cpp @@ -0,0 +1,511 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h" + +#include "suggest/policyimpl/dictionary/bigram/dynamic_bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_gc_event_listeners.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_node_reader.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_helper.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +const int DynamicPatriciaTrieWritingHelper::CHILDREN_POSITION_FIELD_SIZE = 3; +// TODO: Make MAX_DICTIONARY_SIZE 8MB. +const size_t DynamicPatriciaTrieWritingHelper::MAX_DICTIONARY_SIZE = 2 * 1024 * 1024; + +bool DynamicPatriciaTrieWritingHelper::addUnigramWord( + DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability) { + int parentPos = NOT_A_DICT_POS; + while (!readingHelper->isEnd()) { + const int matchedCodePointCount = readingHelper->getPrevTotalCodePointCount(); + if (!readingHelper->isMatchedCodePoint(0 /* index */, + wordCodePoints[matchedCodePointCount])) { + // The first code point is different from target code point. Skip this node and read + // the next sibling node. + readingHelper->readNextSiblingNode(); + continue; + } + // Check following merged node code points. + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper->getNodeReader(); + const int nodeCodePointCount = nodeReader->getCodePointCount(); + for (int j = 1; j < nodeCodePointCount; ++j) { + const int nextIndex = matchedCodePointCount + j; + if (nextIndex >= codePointCount || !readingHelper->isMatchedCodePoint(j, + wordCodePoints[matchedCodePointCount + j])) { + return reallocatePtNodeAndAddNewPtNodes(nodeReader, + readingHelper->getMergedNodeCodePoints(), j, probability, + wordCodePoints + matchedCodePointCount, + codePointCount - matchedCodePointCount); + } + } + // All characters are matched. + if (codePointCount == readingHelper->getTotalCodePointCount()) { + return setPtNodeProbability(nodeReader, probability, + readingHelper->getMergedNodeCodePoints()); + } + if (!nodeReader->hasChildren()) { + return createChildrenPtNodeArrayAndAChildPtNode(nodeReader, probability, + wordCodePoints + readingHelper->getTotalCodePointCount(), + codePointCount - readingHelper->getTotalCodePointCount()); + } + // Advance to the children nodes. + parentPos = nodeReader->getHeadPos(); + readingHelper->readChildNode(); + } + if (readingHelper->isError()) { + // The dictionary is invalid. + return false; + } + int pos = readingHelper->getPosOfLastForwardLinkField(); + return createAndInsertNodeIntoPtNodeArray(parentPos, + wordCodePoints + readingHelper->getPrevTotalCodePointCount(), + codePointCount - readingHelper->getPrevTotalCodePointCount(), + probability, &pos); +} + +bool DynamicPatriciaTrieWritingHelper::addBigramWords(const int word0Pos, const int word1Pos, + const int probability) { + int mMergedNodeCodePoints[MAX_WORD_LENGTH]; + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePosAndGetNodeCodePoints(word0Pos, MAX_WORD_LENGTH, + mMergedNodeCodePoints); + // Move node to add bigram entry. + const int newNodePos = mBuffer->getTailPosition(); + if (!markNodeAsMovedAndSetPosition(&nodeReader, newNodePos, newNodePos)) { + return false; + } + int writingPos = newNodePos; + // Write a new PtNode using original PtNode's info to the tail of the dictionary in mBuffer. + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, &nodeReader, nodeReader.getParentPos(), + mMergedNodeCodePoints, nodeReader.getCodePointCount(), nodeReader.getProbability(), + &writingPos)) { + return false; + } + nodeReader.fetchNodeInfoInBufferFromPtNodePos(newNodePos); + if (nodeReader.getBigramsPos() != NOT_A_DICT_POS) { + // Insert a new bigram entry into the existing bigram list. + int bigramListPos = nodeReader.getBigramsPos(); + return mBigramPolicy->addNewBigramEntryToBigramList(word1Pos, probability, &bigramListPos); + } else { + // The PtNode doesn't have a bigram list. + // First, Write a bigram entry at the tail position of the PtNode. + if (!mBigramPolicy->writeNewBigramEntry(word1Pos, probability, &writingPos)) { + return false; + } + // Then, Mark as the PtNode having bigram list in the flags. + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + PatriciaTrieReadingUtils::createAndGetFlags(nodeReader.isBlacklisted(), + nodeReader.isNotAWord(), nodeReader.getProbability() != NOT_A_PROBABILITY, + nodeReader.getShortcutPos() != NOT_A_DICT_POS, true /* hasBigrams */, + nodeReader.getCodePointCount() > 1, CHILDREN_POSITION_FIELD_SIZE); + writingPos = newNodePos; + // Write updated flags into the moved PtNode's flags field. + return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos); + } +} + +// Remove a bigram relation from word0Pos to word1Pos. +bool DynamicPatriciaTrieWritingHelper::removeBigramWords(const int word0Pos, const int word1Pos) { + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(word0Pos); + if (nodeReader.getBigramsPos() == NOT_A_DICT_POS) { + return false; + } + return mBigramPolicy->removeBigram(nodeReader.getBigramsPos(), word1Pos); +} + +void DynamicPatriciaTrieWritingHelper::writeToDictFile(const char *const fileName, + const HeaderPolicy *const headerPolicy) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, false /* updatesLastUpdatedTime */)) { + return; + } + DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, mBuffer); +} + +void DynamicPatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeArrayPos, + const char *const fileName, const HeaderPolicy *const headerPolicy) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!headerPolicy->writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */)) { + return; + } + BufferWithExtendableBuffer newDictBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */, + MAX_DICTIONARY_SIZE); + if (!runGC(rootPtNodeArrayPos, &newDictBuffer)) { + return; + } + DictFileWritingUtils::flushAllHeaderAndBodyToFile(fileName, &headerBuffer, &newDictBuffer); +} + +bool DynamicPatriciaTrieWritingHelper::markNodeAsDeleted( + const DynamicPatriciaTrieNodeReader *const nodeToUpdate) { + int pos = nodeToUpdate->getHeadPos(); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, false /* isMoved */, + true /* isDeleted */); + int writingPos = nodeToUpdate->getHeadPos(); + // Update flags. + return DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos); +} + +bool DynamicPatriciaTrieWritingHelper::markNodeAsMovedAndSetPosition( + const DynamicPatriciaTrieNodeReader *const originalNode, const int movedPos, + const int bigramLinkedNodePos) { + int pos = originalNode->getHeadPos(); + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(pos); + const uint8_t *const dictBuf = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + pos -= mBuffer->getOriginalBufferSize(); + } + // Read original flags + const PatriciaTrieReadingUtils::NodeFlags originalFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(dictBuf, &pos); + const PatriciaTrieReadingUtils::NodeFlags updatedFlags = + DynamicPatriciaTrieReadingUtils::updateAndGetFlags(originalFlags, true /* isMoved */, + false /* isDeleted */); + int writingPos = originalNode->getHeadPos(); + // Update flags. + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(mBuffer, updatedFlags, + &writingPos)) { + return false; + } + // Update moved position, which is stored in the parent offset field. + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + mBuffer, movedPos, originalNode->getHeadPos(), &writingPos)) { + return false; + } + // Update bigram linked node position, which is stored in the children position field. + int childrenPosFieldPos = originalNode->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( + mBuffer, bigramLinkedNodePos, &childrenPosFieldPos)) { + return false; + } + if (originalNode->hasChildren()) { + // Update children's parent position. + DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); + const DynamicPatriciaTrieNodeReader *const nodeReader = readingHelper.getNodeReader(); + readingHelper.initWithPtNodeArrayPos(originalNode->getChildrenPos()); + while (!readingHelper.isEnd()) { + int parentOffsetFieldPos = nodeReader->getHeadPos() + + DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE; + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + mBuffer, movedPos, nodeReader->getHeadPos(), &parentOffsetFieldPos)) { + // Parent offset cannot be written because of a bug or a broken dictionary; thus, + // we give up to update dictionary. + return false; + } + readingHelper.readNextSiblingNode(); + } + } + return true; +} + +// Write new PtNode at writingPos. +bool DynamicPatriciaTrieWritingHelper::writePtNodeWithFullInfoToBuffer( + BufferWithExtendableBuffer *const bufferToWrite, const bool isBlacklisted, + const bool isNotAWord, const int parentPos, const int *const codePoints, + const int codePointCount, const int probability, const int childrenPos, + const int originalBigramListPos, const int originalShortcutListPos, + int *const writingPos) { + const int nodePos = *writingPos; + // Write dummy flags. The Node flags are updated with appropriate flags at the last step of the + // PtNode writing. + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, + 0 /* nodeFlags */, writingPos)) { + return false; + } + // Calculate a parent offset and write the offset. + if (!DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition(bufferToWrite, + parentPos, nodePos, writingPos)) { + return false; + } + // Write code points + if (!DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition(bufferToWrite, + codePoints, codePointCount, writingPos)) { + return false; + } + // Write probability when the probability is a valid probability, which means this node is + // terminal. + if (probability != NOT_A_PROBABILITY) { + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(bufferToWrite, + probability, writingPos)) { + return false; + } + } + // Write children position + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(bufferToWrite, + childrenPos, writingPos)) { + return false; + } + // Copy shortcut list when the originalShortcutListPos is valid dictionary position. + if (originalShortcutListPos != NOT_A_DICT_POS) { + int fromPos = originalShortcutListPos; + if (!mShortcutPolicy->copyAllShortcutsAndReturnIfSucceededOrNot(bufferToWrite, &fromPos, + writingPos)) { + return false; + } + } + // Copy bigram list when the originalBigramListPos is valid dictionary position. + int bigramCount = 0; + if (originalBigramListPos != NOT_A_DICT_POS) { + int fromPos = originalBigramListPos; + if (!mBigramPolicy->copyAllBigrams(bufferToWrite, &fromPos, writingPos, &bigramCount)) { + return false; + } + } + // Create node flags and write them. + PatriciaTrieReadingUtils::NodeFlags nodeFlags = + PatriciaTrieReadingUtils::createAndGetFlags(isBlacklisted, isNotAWord, + probability != NOT_A_PROBABILITY /* isTerminal */, + originalShortcutListPos != NOT_A_DICT_POS /* hasShortcutTargets */, + bigramCount > 0 /* hasBigrams */, codePointCount > 1 /* hasMultipleChars */, + CHILDREN_POSITION_FIELD_SIZE); + int flagsFieldPos = nodePos; + if (!DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition(bufferToWrite, nodeFlags, + &flagsFieldPos)) { + return false; + } + return true; +} + +bool DynamicPatriciaTrieWritingHelper::writePtNodeToBuffer( + BufferWithExtendableBuffer *const bufferToWrite, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos) { + return writePtNodeWithFullInfoToBuffer(bufferToWrite, false /* isBlacklisted */, + false /* isNotAWord */, parentPos, codePoints, codePointCount, probability, + NOT_A_DICT_POS /* childrenPos */, NOT_A_DICT_POS /* originalBigramsPos */, + NOT_A_DICT_POS /* originalShortcutPos */, writingPos); +} + +bool DynamicPatriciaTrieWritingHelper::writePtNodeToBufferByCopyingPtNodeInfo( + BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos) { + return writePtNodeWithFullInfoToBuffer(bufferToWrite, originalNode->isBlacklisted(), + originalNode->isNotAWord(), parentPos, codePoints, codePointCount, probability, + originalNode->getChildrenPos(), originalNode->getBigramsPos(), + originalNode->getShortcutPos(), writingPos); +} + +bool DynamicPatriciaTrieWritingHelper::createAndInsertNodeIntoPtNodeArray(const int parentPos, + const int *const nodeCodePoints, const int nodeCodePointCount, const int probability, + int *const forwardLinkFieldPos) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, forwardLinkFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentPos, nodeCodePoints, nodeCodePointCount, + probability); +} + +bool DynamicPatriciaTrieWritingHelper::setPtNodeProbability( + const DynamicPatriciaTrieNodeReader *const originalPtNode, const int probability, + const int *const codePoints) { + if (originalPtNode->isTerminal()) { + // Overwrites the probability. + int probabilityFieldPos = originalPtNode->getProbabilityFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition(mBuffer, + probability, &probabilityFieldPos)) { + return false; + } + } else { + // Make the node terminal and write the probability. + int movedPos = mBuffer->getTailPosition(); + if (!markNodeAsMovedAndSetPosition(originalPtNode, movedPos, movedPos)) { + return false; + } + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, originalPtNode, + originalPtNode->getParentPos(), codePoints, originalPtNode->getCodePointCount(), + probability, &movedPos)) { + return false; + } + } + return true; +} + +bool DynamicPatriciaTrieWritingHelper::createChildrenPtNodeArrayAndAChildPtNode( + const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, + const int *const codePoints, const int codePointCount) { + const int newPtNodeArrayPos = mBuffer->getTailPosition(); + int childrenPosFieldPos = parentNode->getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, + newPtNodeArrayPos, &childrenPosFieldPos)) { + return false; + } + return createNewPtNodeArrayWithAChildPtNode(parentNode->getHeadPos(), codePoints, + codePointCount, probability); +} + +bool DynamicPatriciaTrieWritingHelper::createNewPtNodeArrayWithAChildPtNode( + const int parentPtNodePos, const int *const nodeCodePoints, const int nodeCodePointCount, + const int probability) { + int writingPos = mBuffer->getTailPosition(); + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + 1 /* arraySize */, &writingPos)) { + return false; + } + if (!writePtNodeToBuffer(mBuffer, parentPtNodePos, nodeCodePoints, nodeCodePointCount, + probability, &writingPos)) { + return false; + } + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + return true; +} + +// Returns whether the dictionary updating was succeeded or not. +bool DynamicPatriciaTrieWritingHelper::reallocatePtNodeAndAddNewPtNodes( + const DynamicPatriciaTrieNodeReader *const reallocatingPtNode, + const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, + const int probabilityOfNewPtNode, const int *const newNodeCodePoints, + const int newNodeCodePointCount) { + // When addsExtraChild is true, split the reallocating PtNode and add new child. + // Reallocating PtNode: abcde, newNode: abcxy. + // abc (1st, not terminal) __ de (2nd) + // \_ xy (extra child, terminal) + // Otherwise, this method makes 1st part terminal and write probabilityOfNewPtNode. + // Reallocating PtNode: abcde, newNode: abc. + // abc (1st, terminal) __ de (2nd) + const bool addsExtraChild = newNodeCodePointCount > overlappingCodePointCount; + const int firstPartOfReallocatedPtNodePos = mBuffer->getTailPosition(); + int writingPos = firstPartOfReallocatedPtNodePos; + // Write the 1st part of the reallocating node. The children position will be updated later + // with actual children position. + const int newProbability = addsExtraChild ? NOT_A_PROBABILITY : probabilityOfNewPtNode; + if (!writePtNodeToBuffer(mBuffer, reallocatingPtNode->getParentPos(), + reallocatingPtNodeCodePoints, overlappingCodePointCount, newProbability, + &writingPos)) { + return false; + } + const int actualChildrenPos = writingPos; + // Create new children PtNode array. + const size_t newPtNodeCount = addsExtraChild ? 2 : 1; + if (!DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition(mBuffer, + newPtNodeCount, &writingPos)) { + return false; + } + // Write the 2nd part of the reallocating node. + const int secondPartOfReallocatedPtNodePos = writingPos; + if (!writePtNodeToBufferByCopyingPtNodeInfo(mBuffer, reallocatingPtNode, + firstPartOfReallocatedPtNodePos, + reallocatingPtNodeCodePoints + overlappingCodePointCount, + reallocatingPtNode->getCodePointCount() - overlappingCodePointCount, + reallocatingPtNode->getProbability(), &writingPos)) { + return false; + } + if (addsExtraChild) { + if (!writePtNodeToBuffer(mBuffer, firstPartOfReallocatedPtNodePos, + newNodeCodePoints + overlappingCodePointCount, + newNodeCodePointCount - overlappingCodePointCount, probabilityOfNewPtNode, + &writingPos)) { + return false; + } + } + if (!DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition(mBuffer, + NOT_A_DICT_POS /* forwardLinkPos */, &writingPos)) { + return false; + } + // Update original reallocatingPtNode as moved. + if (!markNodeAsMovedAndSetPosition(reallocatingPtNode, firstPartOfReallocatedPtNodePos, + secondPartOfReallocatedPtNodePos)) { + return false; + } + // Load node info. Information of the 1st part will be fetched. + DynamicPatriciaTrieNodeReader nodeReader(mBuffer, mBigramPolicy, mShortcutPolicy); + nodeReader.fetchNodeInfoInBufferFromPtNodePos(firstPartOfReallocatedPtNodePos); + // Update children position. + int childrenPosFieldPos = nodeReader.getChildrenPosFieldPos(); + if (!DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition(mBuffer, + actualChildrenPos, &childrenPosFieldPos)) { + return false; + } + return true; +} + +bool DynamicPatriciaTrieWritingHelper::runGC(const int rootPtNodeArrayPos, + BufferWithExtendableBuffer *const bufferToWrite) { + DynamicPatriciaTrieReadingHelper readingHelper(mBuffer, mBigramPolicy, mShortcutPolicy); + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners + ::TraversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted + traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted( + this, mBuffer); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateUnigramProbabilityAndMarkUselessPtNodesAsDeleted)) { + return false; + } + + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateBigramProbability + traversePolicyToUpdateBigramProbability(mBigramPolicy); + if (!readingHelper.traverseAllPtNodesInPostorderDepthFirstManner( + &traversePolicyToUpdateBigramProbability)) { + return false; + } + + // Mapping from positions in mBuffer to positions in bufferToWrite. + DictPositionRelocationMap dictPositionRelocationMap; + readingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToPlaceAndWriteValidPtNodesToBuffer + traversePolicyToPlaceAndWriteValidPtNodesToBuffer(this, bufferToWrite, + &dictPositionRelocationMap); + if (!readingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToPlaceAndWriteValidPtNodesToBuffer)) { + return false; + } + + // Create policy instance for the GCed dictionary. + DynamicShortcutListPolicy newDictShortcutPolicy(bufferToWrite); + DynamicBigramListPolicy newDictBigramPolicy(bufferToWrite, &newDictShortcutPolicy); + // Create reading helper for the GCed dictionary. + DynamicPatriciaTrieReadingHelper newDictReadingHelper(bufferToWrite, &newDictBigramPolicy, + &newDictShortcutPolicy); + newDictReadingHelper.initWithPtNodeArrayPos(rootPtNodeArrayPos); + DynamicPatriciaTrieGcEventListeners::TraversePolicyToUpdateAllPositionFields + traversePolicyToUpdateAllPositionFields(this, &newDictBigramPolicy, bufferToWrite, + &dictPositionRelocationMap); + if (!newDictReadingHelper.traverseAllPtNodesInPtNodeArrayLevelPreorderDepthFirstManner( + &traversePolicyToUpdateAllPositionFields)) { + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h new file mode 100644 index 000000000..fe1b2437a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_helper.h @@ -0,0 +1,128 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H + +#include <stdint.h> + +#include "defines.h" +#include "utils/hash_map_compat.h" + +namespace latinime { + +class BufferWithExtendableBuffer; +class DynamicBigramListPolicy; +class DynamicPatriciaTrieNodeReader; +class DynamicPatriciaTrieReadingHelper; +class DynamicShortcutListPolicy; +class HeaderPolicy; + +class DynamicPatriciaTrieWritingHelper { + public: + typedef hash_map_compat<int, int> PtNodeArrayPositionRelocationMap; + typedef hash_map_compat<int, int> PtNodePositionRelocationMap; + struct DictPositionRelocationMap { + public: + DictPositionRelocationMap() + : mPtNodeArrayPositionRelocationMap(), mPtNodePositionRelocationMap() {} + + PtNodeArrayPositionRelocationMap mPtNodeArrayPositionRelocationMap; + PtNodePositionRelocationMap mPtNodePositionRelocationMap; + + private: + DISALLOW_COPY_AND_ASSIGN(DictPositionRelocationMap); + }; + + DynamicPatriciaTrieWritingHelper(BufferWithExtendableBuffer *const buffer, + DynamicBigramListPolicy *const bigramPolicy, + DynamicShortcutListPolicy *const shortcutPolicy) + : mBuffer(buffer), mBigramPolicy(bigramPolicy), mShortcutPolicy(shortcutPolicy) {} + + ~DynamicPatriciaTrieWritingHelper() {} + + // Add a word to the dictionary. If the word already exists, update the probability. + bool addUnigramWord(DynamicPatriciaTrieReadingHelper *const readingHelper, + const int *const wordCodePoints, const int codePointCount, const int probability); + + // Add a bigram relation from word0Pos to word1Pos. + bool addBigramWords(const int word0Pos, const int word1Pos, const int probability); + + // Remove a bigram relation from word0Pos to word1Pos. + bool removeBigramWords(const int word0Pos, const int word1Pos); + + void writeToDictFile(const char *const fileName, const HeaderPolicy *const headerPolicy); + + void writeToDictFileWithGC(const int rootPtNodeArrayPos, const char *const fileName, + const HeaderPolicy *const headerPolicy); + + // CAVEAT: This method must be called only from inner classes of + // DynamicPatriciaTrieGcEventListeners. + bool markNodeAsDeleted(const DynamicPatriciaTrieNodeReader *const nodeToUpdate); + + // CAVEAT: This method must be called only from this class or inner classes of + // DynamicPatriciaTrieGcEventListeners. + bool writePtNodeToBufferByCopyingPtNodeInfo(BufferWithExtendableBuffer *const bufferToWrite, + const DynamicPatriciaTrieNodeReader *const originalNode, const int parentPos, + const int *const codePoints, const int codePointCount, const int probability, + int *const writingPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingHelper); + + static const int CHILDREN_POSITION_FIELD_SIZE; + static const size_t MAX_DICTIONARY_SIZE; + + BufferWithExtendableBuffer *const mBuffer; + DynamicBigramListPolicy *const mBigramPolicy; + DynamicShortcutListPolicy *const mShortcutPolicy; + + bool markNodeAsMovedAndSetPosition(const DynamicPatriciaTrieNodeReader *const nodeToUpdate, + const int movedPos, const int bigramLinkedNodePos); + + bool writePtNodeWithFullInfoToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool isBlacklisted, const bool isNotAWord, + const int parentPos, const int *const codePoints, const int codePointCount, + const int probability, const int childrenPos, const int originalBigramListPos, + const int originalShortcutListPos, int *const writingPos); + + bool writePtNodeToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const int parentPos, const int *const codePoints, const int codePointCount, + const int probability, int *const writingPos); + + bool createAndInsertNodeIntoPtNodeArray(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const int probability, int *const forwardLinkFieldPos); + + bool setPtNodeProbability(const DynamicPatriciaTrieNodeReader *const originalNode, + const int probability, const int *const codePoints); + + bool createChildrenPtNodeArrayAndAChildPtNode( + const DynamicPatriciaTrieNodeReader *const parentNode, const int probability, + const int *const codePoints, const int codePointCount); + + bool createNewPtNodeArrayWithAChildPtNode(const int parentPos, const int *const nodeCodePoints, + const int nodeCodePointCount, const int probability); + + bool reallocatePtNodeAndAddNewPtNodes( + const DynamicPatriciaTrieNodeReader *const reallocatingPtNode, + const int *const reallocatingPtNodeCodePoints, const int overlappingCodePointCount, + const int probabilityOfNewPtNode, const int *const newNodeCodePoints, + const int newNodeCodePointCount); + + bool runGC(const int rootPtNodeArrayPos, BufferWithExtendableBuffer *const bufferToWrite); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_HELPER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp new file mode 100644 index 000000000..30ff10cd6 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" + +#include <cstddef> +#include <cstdlib> +#include <stdint.h> + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t DynamicPatriciaTrieWritingUtils::MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD = 0x7F; +const size_t DynamicPatriciaTrieWritingUtils::MAX_PTNODE_ARRAY_SIZE = 0x7FFF; +const int DynamicPatriciaTrieWritingUtils::SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE = 1; +const int DynamicPatriciaTrieWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE = 2; +const int DynamicPatriciaTrieWritingUtils::LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG = 0x8000; +const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_FIELD_SIZE = 3; +const int DynamicPatriciaTrieWritingUtils::MAX_DICT_OFFSET_VALUE = 0x7FFFFF; +const int DynamicPatriciaTrieWritingUtils::MIN_DICT_OFFSET_VALUE = -0x7FFFFF; +const int DynamicPatriciaTrieWritingUtils::DICT_OFFSET_NEGATIVE_FLAG = 0x800000; +const int DynamicPatriciaTrieWritingUtils::PROBABILITY_FIELD_SIZE = 1; +const int DynamicPatriciaTrieWritingUtils::NODE_FLAG_FIELD_SIZE = 1; + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeEmptyDictionary( + BufferWithExtendableBuffer *const buffer, const int rootPos) { + int writingPos = rootPos; + if (!writePtNodeArraySizeAndAdvancePosition(buffer, 0 /* arraySize */, &writingPos)) { + return false; + } + return writeForwardLinkPositionAndAdvancePosition(buffer, NOT_A_DICT_POS /* forwardLinkPos */, + &writingPos); +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeForwardLinkPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, + int *const forwardLinkFieldPos) { + return writeDictOffset(buffer, forwardLinkPos, (*forwardLinkFieldPos), forwardLinkFieldPos); +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writePtNodeArraySizeAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const size_t arraySize, + int *const arraySizeFieldPos) { + // Currently, all array size field to be created has LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE to + // simplify updating process. + // TODO: Use SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE for small arrays. + /*if (arraySize <= MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD) { + return buffer->writeUintAndAdvancePosition(arraySize, SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE, + arraySizeFieldPos); + } else */ + if (arraySize <= MAX_PTNODE_ARRAY_SIZE) { + uint32_t data = arraySize | LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; + return buffer->writeUintAndAdvancePosition(data, LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE, + arraySizeFieldPos); + } else { + AKLOGI("PtNode array size cannot be written because arraySize is too large: %zd", + arraySize); + ASSERT(false); + return false; + } +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeFlagsAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, + const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags, int *const nodeFlagsFieldPos) { + return buffer->writeUintAndAdvancePosition(nodeFlags, NODE_FLAG_FIELD_SIZE, nodeFlagsFieldPos); +} + +// Note that parentOffset is offset from node's head position. +/* static */ bool DynamicPatriciaTrieWritingUtils::writeParentPosOffsetAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int parentPos, const int basePos, + int *const parentPosFieldPos) { + return writeDictOffset(buffer, parentPos, basePos, parentPosFieldPos); +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeCodePointsAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int *const codePoints, + const int codePointCount, int *const codePointFieldPos) { + if (codePointCount <= 0) { + AKLOGI("code points cannot be written because codePointCount is invalid: %d", + codePointCount); + ASSERT(false); + return false; + } + const bool hasMultipleCodePoints = codePointCount > 1; + return buffer->writeCodePointsAndAdvancePosition(codePoints, codePointCount, + hasMultipleCodePoints, codePointFieldPos); +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeProbabilityAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int probability, + int *const probabilityFieldPos) { + if (probability < 0 || probability > MAX_PROBABILITY) { + AKLOGI("probability cannot be written because the probability is invalid: %d", + probability); + ASSERT(false); + return false; + } + return buffer->writeUintAndAdvancePosition(probability, PROBABILITY_FIELD_SIZE, + probabilityFieldPos); +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeChildrenPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int childrenPosition, + int *const childrenPositionFieldPos) { + return writeDictOffset(buffer, childrenPosition, (*childrenPositionFieldPos), + childrenPositionFieldPos); +} + +/* static */ bool DynamicPatriciaTrieWritingUtils::writeDictOffset( + BufferWithExtendableBuffer *const buffer, const int targetPos, const int basePos, + int *const offsetFieldPos) { + int offset = targetPos - basePos; + if (targetPos == NOT_A_DICT_POS) { + offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_INVALID; + } else if (offset == 0) { + offset = DynamicPatriciaTrieReadingUtils::DICT_OFFSET_ZERO_OFFSET; + } + if (offset > MAX_DICT_OFFSET_VALUE || offset < MIN_DICT_OFFSET_VALUE) { + AKLOGI("offset cannot be written because the offset is too large or too small: %d", + offset); + ASSERT(false); + return false; + } + uint32_t data = 0; + if (offset >= 0) { + data = offset; + } else { + data = abs(offset) | DICT_OFFSET_NEGATIVE_FLAG; + } + return buffer->writeUintAndAdvancePosition(data, DICT_OFFSET_FIELD_SIZE, offsetFieldPos); +} +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h new file mode 100644 index 000000000..af76bc6b5 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H +#define LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H + +#include <cstddef> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_reading_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class DynamicPatriciaTrieWritingUtils { + public: + static const int NODE_FLAG_FIELD_SIZE; + + static bool writeEmptyDictionary(BufferWithExtendableBuffer *const buffer, const int rootPos); + + static bool writeForwardLinkPositionAndAdvancePosition( + BufferWithExtendableBuffer *const buffer, const int forwardLinkPos, + int *const forwardLinkFieldPos); + + static bool writePtNodeArraySizeAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const size_t arraySize, int *const arraySizeFieldPos); + + static bool writeFlagsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const DynamicPatriciaTrieReadingUtils::NodeFlags nodeFlags, + int *const nodeFlagsFieldPos); + + static bool writeParentPosOffsetAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int parentPosition, const int basePos, int *const parentPosFieldPos); + + static bool writeCodePointsAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int *const codePoints, const int codePointCount, int *const codePointFieldPos); + + static bool writeProbabilityAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int probability, int *const probabilityFieldPos); + + static bool writeChildrenPositionAndAdvancePosition(BufferWithExtendableBuffer *const buffer, + const int childrenPosition, int *const childrenPositionFieldPos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicPatriciaTrieWritingUtils); + + static const size_t MAX_PTNODE_ARRAY_SIZE_TO_USE_SMALL_SIZE_FIELD; + static const size_t MAX_PTNODE_ARRAY_SIZE; + static const int SMALL_PTNODE_ARRAY_SIZE_FIELD_SIZE; + static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE; + static const int LARGE_PTNODE_ARRAY_SIZE_FIELD_SIZE_FLAG; + static const int DICT_OFFSET_FIELD_SIZE; + static const int MAX_DICT_OFFSET_VALUE; + static const int MIN_DICT_OFFSET_VALUE; + static const int DICT_OFFSET_NEGATIVE_FLAG; + static const int PROBABILITY_FIELD_SIZE; + + static bool writeDictOffset(BufferWithExtendableBuffer *const buffer, const int targetPos, + const int basePos, int *const offsetFieldPos); +}; +} // namespace latinime +#endif /* LATINIME_DYNAMIC_PATRICIA_TRIE_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp new file mode 100644 index 000000000..7bbeacaa0 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/header/header_policy.h" + +#include <cstddef> +#include <cstdio> +#include <ctime> + +namespace latinime { + + +// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. +const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE"; +const char *const HeaderPolicy::USES_FORGETTING_CURVE_KEY = "USES_FORGETTING_CURVE"; +const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; +const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100; +const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f; + +// Used for logging. Question mark is used to indicate that the key is not found. +void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue, + int outValueSize) const { + if (outValueSize <= 0) return; + if (outValueSize == 1) { + outValue[0] = '\0'; + return; + } + std::vector<int> keyCodePointVector; + HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector); + HeaderReadWriteUtils::AttributeMap::const_iterator it = mAttributeMap.find(keyCodePointVector); + if (it == mAttributeMap.end()) { + // The key was not found. + outValue[0] = '?'; + outValue[1] = '\0'; + return; + } + const int terminalIndex = min(static_cast<int>(it->second.size()), outValueSize - 1); + for (int i = 0; i < terminalIndex; ++i) { + outValue[i] = it->second[i]; + } + outValue[terminalIndex] = '\0'; +} + +float HeaderPolicy::readMultipleWordCostMultiplier() const { + std::vector<int> keyVector; + HeaderReadWriteUtils::insertCharactersIntoVector(MULTIPLE_WORDS_DEMOTION_RATE_KEY, &keyVector); + const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + &keyVector, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE); + if (demotionRate <= 0) { + return static_cast<float>(MAX_VALUE_FOR_WEIGHTING); + } + return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate); +} + +bool HeaderPolicy::readUsesForgettingCurveFlag() const { + std::vector<int> keyVector; + HeaderReadWriteUtils::insertCharactersIntoVector(USES_FORGETTING_CURVE_KEY, &keyVector); + return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector, + false /* defaultValue */); +} + +// Returns current time when the key is not found or the value is invalid. +int HeaderPolicy::readLastUpdatedTime() const { + std::vector<int> keyVector; + HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &keyVector); + return HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, &keyVector, + time(0) /* defaultValue */); +} + +bool HeaderPolicy::writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool updatesLastUpdatedTime) const { + int writingPos = 0; + if (!HeaderReadWriteUtils::writeDictionaryVersion(bufferToWrite, mDictFormatVersion, + &writingPos)) { + return false; + } + if (!HeaderReadWriteUtils::writeDictionaryFlags(bufferToWrite, mDictionaryFlags, + &writingPos)) { + return false; + } + // Temporarily writes a dummy header size. + int headerSizeFieldPos = writingPos; + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, 0 /* size */, + &writingPos)) { + return false; + } + if (updatesLastUpdatedTime) { + // Set current time as a last updated time. + HeaderReadWriteUtils::AttributeMap attributeMapTowrite(mAttributeMap); + std::vector<int> updatedTimekey; + HeaderReadWriteUtils::insertCharactersIntoVector(LAST_UPDATED_TIME_KEY, &updatedTimekey); + HeaderReadWriteUtils::setIntAttribute(&attributeMapTowrite, &updatedTimekey, time(0)); + if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &attributeMapTowrite, + &writingPos)) { + return false; + } + } else { + if (!HeaderReadWriteUtils::writeHeaderAttributes(bufferToWrite, &mAttributeMap, + &writingPos)) { + return false; + } + } + // Writes an actual header size. + if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(bufferToWrite, writingPos, + &headerSizeFieldPos)) { + return false; + } + return true; +} + +/* static */ HeaderReadWriteUtils::AttributeMap + HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) { + HeaderReadWriteUtils::AttributeMap attributeMap; + HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap); + return attributeMap; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h new file mode 100644 index 000000000..e97c08ca4 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_POLICY_H +#define LATINIME_HEADER_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_header_structure_policy.h" +#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" + +namespace latinime { + +class HeaderPolicy : public DictionaryHeaderStructurePolicy { + public: + // Reads information from existing dictionary buffer. + HeaderPolicy(const uint8_t *const dictBuf, const int dictSize) + : mDictFormatVersion(FormatUtils::detectFormatVersion(dictBuf, dictSize)), + mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), + mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), + mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), + mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), + mUsesForgettingCurve(readUsesForgettingCurveFlag()), + mLastUpdatedTime(readLastUpdatedTime()) {} + + // Constructs header information using an attribute map. + HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, + const HeaderReadWriteUtils::AttributeMap *const attributeMap) + : mDictFormatVersion(dictFormatVersion), + mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + attributeMap)), mSize(0), mAttributeMap(*attributeMap), + mMultiWordCostMultiplier(readUsesForgettingCurveFlag()), + mUsesForgettingCurve(readUsesForgettingCurveFlag()), + mLastUpdatedTime(readLastUpdatedTime()) {} + + ~HeaderPolicy() {} + + AK_FORCE_INLINE int getSize() const { + return mSize; + } + + AK_FORCE_INLINE bool supportsDynamicUpdate() const { + return HeaderReadWriteUtils::supportsDynamicUpdate(mDictionaryFlags); + } + + AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { + return HeaderReadWriteUtils::requiresGermanUmlautProcessing(mDictionaryFlags); + } + + AK_FORCE_INLINE bool requiresFrenchLigatureProcessing() const { + return HeaderReadWriteUtils::requiresFrenchLigatureProcessing(mDictionaryFlags); + } + + AK_FORCE_INLINE float getMultiWordCostMultiplier() const { + return mMultiWordCostMultiplier; + } + + AK_FORCE_INLINE bool usesForgettingCurve() const { + return mUsesForgettingCurve; + } + + AK_FORCE_INLINE int getLastUpdatedTime() const { + return mLastUpdatedTime; + } + + void readHeaderValueOrQuestionMark(const char *const key, + int *outValue, int outValueSize) const; + + bool writeHeaderToBuffer(BufferWithExtendableBuffer *const bufferToWrite, + const bool updatesLastUpdatedTime) const; + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderPolicy); + + static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; + static const char *const USES_FORGETTING_CURVE_KEY; + static const char *const LAST_UPDATED_TIME_KEY; + static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; + static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; + + const FormatUtils::FORMAT_VERSION mDictFormatVersion; + const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; + const int mSize; + HeaderReadWriteUtils::AttributeMap mAttributeMap; + const float mMultiWordCostMultiplier; + const bool mUsesForgettingCurve; + const int mLastUpdatedTime; + + float readMultipleWordCostMultiplier() const; + + bool readUsesForgettingCurveFlag() const; + + int readLastUpdatedTime() const; + + static HeaderReadWriteUtils::AttributeMap createAttributeMapAndReadAllAttributes( + const uint8_t *const dictBuf); +}; +} // namespace latinime +#endif /* LATINIME_HEADER_POLICY_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp new file mode 100644 index 000000000..3b1c78085 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" + +#include <cctype> +#include <cstdio> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_KEY_LENGTH = 256; +const int HeaderReadWriteUtils::MAX_ATTRIBUTE_VALUE_LENGTH = 256; + +const int HeaderReadWriteUtils::HEADER_MAGIC_NUMBER_SIZE = 4; +const int HeaderReadWriteUtils::HEADER_DICTIONARY_VERSION_SIZE = 2; +const int HeaderReadWriteUtils::HEADER_FLAG_SIZE = 2; +const int HeaderReadWriteUtils::HEADER_SIZE_FIELD_SIZE = 4; + +const HeaderReadWriteUtils::DictionaryFlags HeaderReadWriteUtils::NO_FLAGS = 0; +// Flags for special processing +// Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAG) or +// something very bad (like, the apocalypse) will happen. Please update both at the same time. +const HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::GERMAN_UMLAUT_PROCESSING_FLAG = 0x1; +const HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_FLAG = 0x2; +const HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::FRENCH_LIGATURE_PROCESSING_FLAG = 0x4; + +// Note that these are corresponding definitions in Java side in FormatSpec.FileHeader. +const char *const HeaderReadWriteUtils::SUPPORTS_DYNAMIC_UPDATE_KEY = "SUPPORTS_DYNAMIC_UPDATE"; +const char *const HeaderReadWriteUtils::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = + "REQUIRES_GERMAN_UMLAUT_PROCESSING"; +const char *const HeaderReadWriteUtils::REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY = + "REQUIRES_FRENCH_LIGATURE_PROCESSING"; + +/* static */ int HeaderReadWriteUtils::getHeaderSize(const uint8_t *const dictBuf) { + // See the format of the header in the comment in + // BinaryDictionaryFormatUtils::detectFormatVersion() + return ByteArrayUtils::readUint32(dictBuf, HEADER_MAGIC_NUMBER_SIZE + + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE); +} + +/* static */ HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::getFlags(const uint8_t *const dictBuf) { + return ByteArrayUtils::readUint16(dictBuf, + HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE); +} + +/* static */ HeaderReadWriteUtils::DictionaryFlags + HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( + const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + AttributeMap::key_type key; + insertCharactersIntoVector(REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, &key); + const bool requiresGermanUmlautProcessing = readBoolAttributeValue(attributeMap, &key, + false /* defaultValue */); + key.clear(); + insertCharactersIntoVector(REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY, &key); + const bool requiresFrenchLigatureProcessing = readBoolAttributeValue(attributeMap, &key, + false /* defaultValue */); + key.clear(); + insertCharactersIntoVector(SUPPORTS_DYNAMIC_UPDATE_KEY, &key); + const bool supportsDynamicUpdate = readBoolAttributeValue(attributeMap, &key, + false /* defaultValue */); + DictionaryFlags dictflags = NO_FLAGS; + dictflags |= requiresGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0; + dictflags |= requiresFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0; + dictflags |= supportsDynamicUpdate ? SUPPORTS_DYNAMIC_UPDATE_FLAG : 0; + return dictflags; +} + +/* static */ void HeaderReadWriteUtils::fetchAllHeaderAttributes(const uint8_t *const dictBuf, + AttributeMap *const headerAttributes) { + const int headerSize = getHeaderSize(dictBuf); + int pos = getHeaderOptionsPosition(); + if (pos == NOT_A_DICT_POS) { + // The header doesn't have header options. + return; + } + int keyBuffer[MAX_ATTRIBUTE_KEY_LENGTH]; + int valueBuffer[MAX_ATTRIBUTE_VALUE_LENGTH]; + while (pos < headerSize) { + const int keyLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, + MAX_ATTRIBUTE_KEY_LENGTH, keyBuffer, &pos); + std::vector<int> key; + key.insert(key.end(), keyBuffer, keyBuffer + keyLength); + const int valueLength = ByteArrayUtils::readStringAndAdvancePosition(dictBuf, + MAX_ATTRIBUTE_VALUE_LENGTH, valueBuffer, &pos); + std::vector<int> value; + value.insert(value.end(), valueBuffer, valueBuffer + valueLength); + headerAttributes->insert(AttributeMap::value_type(key, value)); + } +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryVersion( + BufferWithExtendableBuffer *const buffer, const FormatUtils::FORMAT_VERSION version, + int *const writingPos) { + if (!buffer->writeUintAndAdvancePosition(FormatUtils::MAGIC_NUMBER, HEADER_MAGIC_NUMBER_SIZE, + writingPos)) { + return false; + } + switch (version) { + case FormatUtils::VERSION_2: + // Version 2 dictionary writing is not supported. + return false; + case FormatUtils::VERSION_3: + return buffer->writeUintAndAdvancePosition(3 /* data */, + HEADER_DICTIONARY_VERSION_SIZE, writingPos); + default: + return false; + } +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryFlags( + BufferWithExtendableBuffer *const buffer, const DictionaryFlags flags, + int *const writingPos) { + return buffer->writeUintAndAdvancePosition(flags, HEADER_FLAG_SIZE, writingPos); +} + +/* static */ bool HeaderReadWriteUtils::writeDictionaryHeaderSize( + BufferWithExtendableBuffer *const buffer, const int size, int *const writingPos) { + return buffer->writeUintAndAdvancePosition(size, HEADER_SIZE_FIELD_SIZE, writingPos); +} + +/* static */ bool HeaderReadWriteUtils::writeHeaderAttributes( + BufferWithExtendableBuffer *const buffer, const AttributeMap *const headerAttributes, + int *const writingPos) { + for (AttributeMap::const_iterator it = headerAttributes->begin(); + it != headerAttributes->end(); ++it) { + // Write a key. + if (!buffer->writeCodePointsAndAdvancePosition(&(it->first.at(0)), it->first.size(), + true /* writesTerminator */, writingPos)) { + return false; + } + // Write a value. + if (!buffer->writeCodePointsAndAdvancePosition(&(it->second.at(0)), it->second.size(), + true /* writesTerminator */, writingPos)) { + return false; + } + } + return true; +} + +/* static */ void HeaderReadWriteUtils::setBoolAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const bool value) { + setIntAttribute(headerAttributes, key, value ? 1 : 0); +} + +/* static */ void HeaderReadWriteUtils::setIntAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int value) { + AttributeMap::mapped_type valueVector; + char charBuf[LARGEST_INT_DIGIT_COUNT + 1]; + snprintf(charBuf, LARGEST_INT_DIGIT_COUNT + 1, "%d", value); + insertCharactersIntoVector(charBuf, &valueVector); + (*headerAttributes)[*key] = valueVector; +} + +/* static */ bool HeaderReadWriteUtils::readBoolAttributeValue( + const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, + const bool defaultValue) { + const int intDefaultValue = defaultValue ? 1 : 0; + const int intValue = readIntAttributeValue(headerAttributes, key, intDefaultValue); + return intValue != 0; +} + +/* static */ int HeaderReadWriteUtils::readIntAttributeValue( + const AttributeMap *const headerAttributes, const AttributeMap::key_type *const key, + const int defaultValue) { + AttributeMap::const_iterator it = headerAttributes->find(*key); + if (it != headerAttributes->end()) { + int value = 0; + bool isNegative = false; + for (size_t i = 0; i < it->second.size(); ++i) { + if (i == 0 && it->second.at(i) == '-') { + isNegative = true; + } else { + if (!isdigit(it->second.at(i))) { + // If not a number. + return defaultValue; + } + value *= 10; + value += it->second.at(i) - '0'; + } + } + return isNegative ? -value : value; + } + return defaultValue; +} + +/* static */ void HeaderReadWriteUtils::insertCharactersIntoVector(const char *const characters, + std::vector<int> *const vector) { + for (int i = 0; characters[i]; ++i) { + vector->push_back(characters[i]); + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h new file mode 100644 index 000000000..caa5097f6 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_read_write_utils.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_HEADER_READ_WRITE_UTILS_H +#define LATINIME_HEADER_READ_WRITE_UTILS_H + +#include <map> +#include <stdint.h> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class HeaderReadWriteUtils { + public: + typedef uint16_t DictionaryFlags; + typedef std::map<std::vector<int>, std::vector<int> > AttributeMap; + + static int getHeaderSize(const uint8_t *const dictBuf); + + static DictionaryFlags getFlags(const uint8_t *const dictBuf); + + static AK_FORCE_INLINE bool supportsDynamicUpdate(const DictionaryFlags flags) { + return (flags & SUPPORTS_DYNAMIC_UPDATE_FLAG) != 0; + } + + static AK_FORCE_INLINE bool requiresGermanUmlautProcessing(const DictionaryFlags flags) { + return (flags & GERMAN_UMLAUT_PROCESSING_FLAG) != 0; + } + + static AK_FORCE_INLINE bool requiresFrenchLigatureProcessing(const DictionaryFlags flags) { + return (flags & FRENCH_LIGATURE_PROCESSING_FLAG) != 0; + } + + static AK_FORCE_INLINE int getHeaderOptionsPosition() { + return HEADER_MAGIC_NUMBER_SIZE + HEADER_DICTIONARY_VERSION_SIZE + HEADER_FLAG_SIZE + + HEADER_SIZE_FIELD_SIZE; + } + + static DictionaryFlags createAndGetDictionaryFlagsUsingAttributeMap( + const HeaderReadWriteUtils::AttributeMap *const attributeMap); + + static void fetchAllHeaderAttributes(const uint8_t *const dictBuf, + AttributeMap *const headerAttributes); + + static bool writeDictionaryVersion(BufferWithExtendableBuffer *const buffer, + const FormatUtils::FORMAT_VERSION version, int *const writingPos); + + static bool writeDictionaryFlags(BufferWithExtendableBuffer *const buffer, + const DictionaryFlags flags, int *const writingPos); + + static bool writeDictionaryHeaderSize(BufferWithExtendableBuffer *const buffer, + const int size, int *const writingPos); + + static bool writeHeaderAttributes(BufferWithExtendableBuffer *const buffer, + const AttributeMap *const headerAttributes, int *const writingPos); + + /** + * Methods for header attributes. + */ + static void setBoolAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const bool value); + + static void setIntAttribute(AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int value); + + static bool readBoolAttributeValue(const AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const bool defaultValue); + + static int readIntAttributeValue(const AttributeMap *const headerAttributes, + const AttributeMap::key_type *const key, const int defaultValue); + + static void insertCharactersIntoVector(const char *const characters, + AttributeMap::key_type *const key); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(HeaderReadWriteUtils); + + static const int MAX_ATTRIBUTE_KEY_LENGTH; + static const int MAX_ATTRIBUTE_VALUE_LENGTH; + + static const int HEADER_MAGIC_NUMBER_SIZE; + static const int HEADER_DICTIONARY_VERSION_SIZE; + static const int HEADER_FLAG_SIZE; + static const int HEADER_SIZE_FIELD_SIZE; + + static const DictionaryFlags NO_FLAGS; + // Flags for special processing + // Those *must* match the flags in makedict (FormatSpec#*_PROCESSING_FLAGS) or + // something very bad (like, the apocalypse) will happen. Please update both at the same time. + static const DictionaryFlags GERMAN_UMLAUT_PROCESSING_FLAG; + static const DictionaryFlags SUPPORTS_DYNAMIC_UPDATE_FLAG; + static const DictionaryFlags FRENCH_LIGATURE_PROCESSING_FLAG; + + static const char *const SUPPORTS_DYNAMIC_UPDATE_KEY; + static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; + static const char *const REQUIRES_FRENCH_LIGATURE_PROCESSING_KEY; +}; +} +#endif /* LATINIME_HEADER_READ_WRITE_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp new file mode 100644 index 000000000..8a84bd261 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.cpp @@ -0,0 +1,433 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#include "suggest/policyimpl/dictionary/patricia_trie_policy.h" + +#include "defines.h" +#include "suggest/core/dicnode/dic_node.h" +#include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/probability_utils.h" + +namespace latinime { + +void PatriciaTriePolicy::createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const { + if (!dicNode->hasChildren()) { + return; + } + int nextPos = dicNode->getChildrenPos(); + if (nextPos < 0 || nextPos >= mDictBufferSize) { + AKLOGE("Children PtNode array position is invalid. pos: %d, dict size: %d", + nextPos, mDictBufferSize); + ASSERT(false); + return; + } + const int childCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictRoot, &nextPos); + for (int i = 0; i < childCount; i++) { + if (nextPos < 0 || nextPos >= mDictBufferSize) { + AKLOGE("Child PtNode position is invalid. pos: %d, dict size: %d, childCount: %d / %d", + nextPos, mDictBufferSize, i, childCount); + ASSERT(false); + return; + } + nextPos = createAndGetLeavingChildNode(dicNode, nextPos, childDicNodes); + } +} + +// This retrieves code points and the probability of the word by its terminal position. +// Due to the fact that words are ordered in the dictionary in a strict breadth-first order, +// it is possible to check for this with advantageous complexity. For each node, we search +// for PtNodes with children and compare the children position with the position we look for. +// When we shoot the position we look for, it means the word we look for is in the children +// of the previous PtNode. The only tricky part is the fact that if we arrive at the end of a +// PtNode array with the last PtNode's children position still less than what we are searching for, +// we must descend the last PtNode's children (for example, if the word we are searching for starts +// with a z, it's the last PtNode of the root array, so all children addresses will be smaller +// than the position we look for, and we have to descend the z node). +/* Parameters : + * ptNodePos: the byte position of the terminal PtNode of the word we are searching for (this is + * what is stored as the "bigram position" in each bigram) + * outCodePoints: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramProbability: a pointer to an int to write the probability into. + * Return value : the code point count, of 0 if the word was not found. + */ +// TODO: Split this function to be more readable +int PatriciaTriePolicy::getCodePointsAndProbabilityAndReturnCodePointCount( + const int ptNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const { + int pos = getRootPosition(); + int wordPos = 0; + // One iteration of the outer loop iterates through PtNode arrays. As stated above, we will + // only traverse nodes that are actually a part of the terminal we are searching, so each time + // we enter this loop we are one depth level further than last time. + // The only reason we count nodes is because we want to reduce the probability of infinite + // looping in case there is a bug. Since we know there is an upper bound to the depth we are + // supposed to traverse, it does not hurt to count iterations. + for (int loopCount = maxCodePointCount; loopCount > 0; --loopCount) { + int lastCandidatePtNodePos = 0; + // Let's loop through PtNodes in this PtNode array searching for either the terminal + // or one of its ascendants. + for (int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition( + mDictRoot, &pos); ptNodeCount > 0; --ptNodeCount) { + const int startPos = pos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + const int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + if (ptNodePos == startPos) { + // We found the position. Copy the rest of the code points in the buffer and return + // the length. + outCodePoints[wordPos] = character; + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + // We count code points in order to avoid infinite loops if the file is broken + // or if there is some other bug + int charCount = maxCodePointCount; + while (NOT_A_CODE_POINT != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + } + } + *outUnigramProbability = + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + &pos); + return ++wordPos; + } + // We need to skip past this PtNode, so skip any remaining code points after the + // first and possibly the probability. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + // The fact that this PtNode has children is very important. Since we already know + // that this PtNode does not match, if it has no children we know it is irrelevant + // to what we are searching for. + const bool hasChildren = PatriciaTrieReadingUtils::hasChildrenInFlags(flags); + // We will write in `found' whether we have passed the children position we are + // searching for. For example if we search for "beer", the children of b are less + // than the address we are searching for and the children of c are greater. When we + // come here for c, we realize this is too big, and that we should descend b. + bool found; + if (hasChildren) { + int currentPos = pos; + // Here comes the tricky part. First, read the children position. + const int childrenPos = PatriciaTrieReadingUtils + ::readChildrenPositionAndAdvancePosition(mDictRoot, flags, ¤tPos); + if (childrenPos > ptNodePos) { + // If the children pos is greater than the position, it means the previous + // PtNode, which position is stored in lastCandidatePtNodePos, was the right + // one. + found = true; + } else if (1 >= ptNodeCount) { + // However if we are on the LAST PtNode of this array, and we have NOT shot the + // position we should descend THIS node. So we trick the lastCandidatePtNodePos + // so that we will descend this PtNode, not the previous one. + lastCandidatePtNodePos = startPos; + found = true; + } else { + // Else, we should continue looking. + found = false; + } + } else { + // Even if we don't have children here, we could still be on the last PtNode of / + // this array. If this is the case, we should descend the last PtNode that had + // children, and their position is already in lastCandidatePtNodePos. + found = (1 >= ptNodeCount); + } + + if (found) { + // Okay, we found the PtNode we should descend. Its position is in + // the lastCandidatePtNodePos variable, so we just re-read it. + if (0 != lastCandidatePtNodePos) { + const PatriciaTrieReadingUtils::NodeFlags lastFlags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + const int lastChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + // We copy all the characters in this PtNode to the buffer + outCodePoints[wordPos] = lastChar; + if (PatriciaTrieReadingUtils::hasMultipleChars(lastFlags)) { + int nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + int charCount = maxCodePointCount; + while (-1 != nextChar && --charCount > 0) { + outCodePoints[++wordPos] = nextChar; + nextChar = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &lastCandidatePtNodePos); + } + } + ++wordPos; + // Now we only need to branch to the children address. Skip the probability if + // it's there, read pos, and break to resume the search at pos. + if (PatriciaTrieReadingUtils::isTerminal(lastFlags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, + &lastCandidatePtNodePos); + } + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, lastFlags, &lastCandidatePtNodePos); + break; + } else { + // Here is a little tricky part: we come here if we found out that all children + // addresses in this PtNode are bigger than the address we are searching for. + // Should we conclude the word is not in the dictionary? No! It could still be + // one of the remaining PtNodes in this array, so we have to keep looking in + // this array until we find it (or we realize it's not there either, in which + // case it's actually not in the dictionary). Pass the end of this PtNode, + // ready to start the next one. + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + } else { + // If we did not find it, we should record the last children address for the next + // iteration. + if (hasChildren) lastCandidatePtNodePos = startPos; + // Now skip the end of this PtNode (children pos and the attributes if any) so that + // our pos is after the end of this PtNode, at the start of the next one. + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + + } + } + // If we have looked through all the PtNodes and found no match, the ptNodePos is + // not the position of a terminal in this dictionary. + return 0; +} + +// This function gets the position of the terminal node of the exact matching word in the +// dictionary. If no match is found, it returns NOT_A_DICT_POS. +int PatriciaTriePolicy::getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const { + int pos = getRootPosition(); + int wordPos = 0; + + while (true) { + // If we already traversed the tree further than the word is long, there means + // there was no match (or we would have found it). + if (wordPos >= length) return NOT_A_DICT_POS; + int ptNodeCount = PatriciaTrieReadingUtils::getPtNodeArraySizeAndAdvancePosition(mDictRoot, + &pos); + const int wChar = forceLowerCaseSearch + ? CharUtils::toLowerCase(inWord[wordPos]) : inWord[wordPos]; + while (true) { + // If there are no more PtNodes in this array, it means we could not + // find a matching character for this depth, therefore there is no match. + if (0 >= ptNodeCount) return NOT_A_DICT_POS; + const int ptNodePos = pos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + int character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, + &pos); + if (character == wChar) { + // This is the correct PtNode. Only one PtNode may start with the same char within + // a PtNode array, so either we found our match in this array, or there is + // no match and we can return NOT_A_DICT_POS. So we will check all the + // characters in this PtNode indeed does match. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition(mDictRoot, + &pos); + while (NOT_A_CODE_POINT != character) { + ++wordPos; + // If we shoot the length of the word we search for, or if we find a single + // character that does not match, as explained above, it means the word is + // not in the dictionary (by virtue of this PtNode being the only one to + // match the word on the first character, but not matching the whole word). + if (wordPos >= length) return NOT_A_DICT_POS; + if (inWord[wordPos] != character) return NOT_A_DICT_POS; + character = PatriciaTrieReadingUtils::getCodePointAndAdvancePosition( + mDictRoot, &pos); + } + } + // If we come here we know that so far, we do match. Either we are on a terminal + // and we match the length, in which case we found it, or we traverse children. + // If we don't match the length AND don't have children, then a word in the + // dictionary fully matches a prefix of the searched word but not the full word. + ++wordPos; + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + if (wordPos == length) { + return ptNodePos; + } + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (!PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + return NOT_A_DICT_POS; + } + // We have children and we are still shorter than the word we are searching for, so + // we need to traverse children. Put the pointer on the children position, and + // break + pos = PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, + flags, &pos); + break; + } else { + // This PtNode does not match, so skip the remaining part and go to the next. + if (PatriciaTrieReadingUtils::hasMultipleChars(flags)) { + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, + &pos); + } + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, + flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + mBigramListPolicy.skipAllBigrams(&pos); + } + } + --ptNodeCount; + } + } +} + +int PatriciaTriePolicy::getProbability(const int unigramProbability, + const int bigramProbability) const { + if (unigramProbability == NOT_A_PROBABILITY) { + return NOT_A_PROBABILITY; + } else if (bigramProbability == NOT_A_PROBABILITY) { + return ProbabilityUtils::backoff(unigramProbability); + } else { + return ProbabilityUtils::computeProbabilityForBigram(unigramProbability, + bigramProbability); + } +} + +int PatriciaTriePolicy::getUnigramProbabilityOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_PROBABILITY; + } + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + if (!PatriciaTrieReadingUtils::isTerminal(flags)) { + return NOT_A_PROBABILITY; + } + if (PatriciaTrieReadingUtils::isNotAWord(flags) + || PatriciaTrieReadingUtils::isBlacklisted(flags)) { + // If this is not a word, or if it's a blacklisted entry, it should behave as + // having no probability outside of the suggestion process (where it should be used + // for shortcuts). + return NOT_A_PROBABILITY; + } + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + return getProbability(PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition( + mDictRoot, &pos), NOT_A_PROBABILITY); +} + +int PatriciaTriePolicy::getShortcutPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + if (!PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + return NOT_A_DICT_POS; + } + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); + } + return pos; +} + +int PatriciaTriePolicy::getBigramsPositionOfPtNode(const int ptNodePos) const { + if (ptNodePos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + if (!PatriciaTrieReadingUtils::hasBigrams(flags)) { + return NOT_A_DICT_POS; + } + PatriciaTrieReadingUtils::skipCharacters(mDictRoot, flags, MAX_WORD_LENGTH, &pos); + if (PatriciaTrieReadingUtils::isTerminal(flags)) { + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos); + } + if (PatriciaTrieReadingUtils::hasChildrenInFlags(flags)) { + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition(mDictRoot, flags, &pos); + } + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + mShortcutListPolicy.skipAllShortcuts(&pos);; + } + return pos; +} + +int PatriciaTriePolicy::createAndGetLeavingChildNode(const DicNode *const dicNode, + const int ptNodePos, DicNodeVector *childDicNodes) const { + int pos = ptNodePos; + const PatriciaTrieReadingUtils::NodeFlags flags = + PatriciaTrieReadingUtils::getFlagsAndAdvancePosition(mDictRoot, &pos); + int mergedNodeCodePoints[MAX_WORD_LENGTH]; + const int mergedNodeCodePointCount = PatriciaTrieReadingUtils::getCharsAndAdvancePosition( + mDictRoot, flags, MAX_WORD_LENGTH, mergedNodeCodePoints, &pos); + const int probability = (PatriciaTrieReadingUtils::isTerminal(flags))? + PatriciaTrieReadingUtils::readProbabilityAndAdvancePosition(mDictRoot, &pos) + : NOT_A_PROBABILITY; + const int childrenPos = PatriciaTrieReadingUtils::hasChildrenInFlags(flags) ? + PatriciaTrieReadingUtils::readChildrenPositionAndAdvancePosition( + mDictRoot, flags, &pos) : NOT_A_DICT_POS; + if (PatriciaTrieReadingUtils::hasShortcutTargets(flags)) { + getShortcutsStructurePolicy()->skipAllShortcuts(&pos); + } + if (PatriciaTrieReadingUtils::hasBigrams(flags)) { + getBigramsStructurePolicy()->skipAllBigrams(&pos); + } + if (mergedNodeCodePointCount <= 0) { + AKLOGE("Empty PtNode is not allowed. Code point count: %d", mergedNodeCodePointCount); + ASSERT(false); + return pos; + } + childDicNodes->pushLeavingChild(dicNode, ptNodePos, childrenPos, probability, + PatriciaTrieReadingUtils::isTerminal(flags), + PatriciaTrieReadingUtils::hasChildrenInFlags(flags), + PatriciaTrieReadingUtils::isBlacklisted(flags) || + PatriciaTrieReadingUtils::isNotAWord(flags), + mergedNodeCodePointCount, mergedNodeCodePoints); + return pos; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h new file mode 100644 index 000000000..f1de914cb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_policy.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PATRICIA_TRIE_POLICY_H +#define LATINIME_PATRICIA_TRIE_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" +#include "suggest/policyimpl/dictionary/bigram/bigram_list_policy.h" +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h" +#include "suggest/policyimpl/dictionary/utils/mmapped_buffer.h" + +namespace latinime { + +class DicNode; +class DicNodeVector; + +class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { + public: + PatriciaTriePolicy(const MmappedBuffer *const buffer) + : mBuffer(buffer), mHeaderPolicy(mBuffer->getBuffer(), buffer->getBufferSize()), + mDictRoot(mBuffer->getBuffer() + mHeaderPolicy.getSize()), + mDictBufferSize(mBuffer->getBufferSize() - mHeaderPolicy.getSize()), + mBigramListPolicy(mDictRoot), mShortcutListPolicy(mDictRoot) {} + + ~PatriciaTriePolicy() { + delete mBuffer; + } + + AK_FORCE_INLINE int getRootPosition() const { + return 0; + } + + void createAndGetAllChildNodes(const DicNode *const dicNode, + DicNodeVector *const childDicNodes) const; + + int getCodePointsAndProbabilityAndReturnCodePointCount( + const int terminalNodePos, const int maxCodePointCount, int *const outCodePoints, + int *const outUnigramProbability) const; + + int getTerminalNodePositionOfWord(const int *const inWord, + const int length, const bool forceLowerCaseSearch) const; + + int getProbability(const int unigramProbability, const int bigramProbability) const; + + int getUnigramProbabilityOfPtNode(const int ptNodePos) const; + + int getShortcutPositionOfPtNode(const int ptNodePos) const; + + int getBigramsPositionOfPtNode(const int ptNodePos) const; + + const DictionaryHeaderStructurePolicy *getHeaderStructurePolicy() const { + return &mHeaderPolicy; + } + + const DictionaryBigramsStructurePolicy *getBigramsStructurePolicy() const { + return &mBigramListPolicy; + } + + const DictionaryShortcutsStructurePolicy *getShortcutsStructurePolicy() const { + return &mShortcutListPolicy; + } + + bool addUnigramWord(const int *const word, const int length, const int probability) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addUnigramWord() is called for non-updatable dictionary."); + return false; + } + + bool addBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1, const int probability) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: addBigramWords() is called for non-updatable dictionary."); + return false; + } + + bool removeBigramWords(const int *const word0, const int length0, const int *const word1, + const int length1) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: removeBigramWords() is called for non-updatable dictionary."); + return false; + } + + void flush(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flush() is called for non-updatable dictionary."); + } + + void flushWithGC(const char *const filePath) { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: flushWithGC() is called for non-updatable dictionary."); + } + + bool needsToRunGC() const { + // This method should not be called for non-updatable dictionary. + AKLOGI("Warning: needsToRunGC() is called for non-updatable dictionary."); + return false; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTriePolicy); + + const MmappedBuffer *const mBuffer; + const HeaderPolicy mHeaderPolicy; + const uint8_t *const mDictRoot; + const int mDictBufferSize; + const BigramListPolicy mBigramListPolicy; + const ShortcutListPolicy mShortcutListPolicy; + + int createAndGetLeavingChildNode(const DicNode *const dicNode, const int ptNodePos, + DicNodeVector *const childDicNodes) const; +}; +} // namespace latinime +#endif // LATINIME_PATRICIA_TRIE_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp new file mode 100644 index 000000000..7df55815f --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/patricia_trie_reading_utils.h" + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +typedef PatriciaTrieReadingUtils PtReadingUtils; + +const PtReadingUtils::NodeFlags PtReadingUtils::MASK_CHILDREN_POSITION_TYPE = 0xC0; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_NOPOSITION = 0x00; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_ONEBYTE = 0x40; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_TWOBYTES = 0x80; +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_CHILDREN_POSITION_TYPE_THREEBYTES = 0xC0; + +// Flag for single/multiple char group +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_MULTIPLE_CHARS = 0x20; +// Flag for terminal PtNodes +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_TERMINAL = 0x10; +// Flag for shortcut targets presence +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_SHORTCUT_TARGETS = 0x08; +// Flag for bigram presence +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_HAS_BIGRAMS = 0x04; +// Flag for non-words (typically, shortcut only entries) +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_NOT_A_WORD = 0x02; +// Flag for blacklist +const PtReadingUtils::NodeFlags PtReadingUtils::FLAG_IS_BLACKLISTED = 0x01; + +/* static */ int PtReadingUtils::getPtNodeArraySizeAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t firstByte = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); + if (firstByte < 0x80) { + return firstByte; + } else { + return ((firstByte & 0x7F) << 8) ^ ByteArrayUtils::readUint8AndAdvancePosition( + buffer, pos); + } +} + +/* static */ PtReadingUtils::NodeFlags PtReadingUtils::getFlagsAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); +} + +/* static */ int PtReadingUtils::getCodePointAndAdvancePosition(const uint8_t *const buffer, + int *const pos) { + return ByteArrayUtils::readCodePointAndAdvancePosition(buffer, pos); +} + +// Returns the number of read characters. +/* static */ int PtReadingUtils::getCharsAndAdvancePosition(const uint8_t *const buffer, + const NodeFlags flags, const int maxLength, int *const outBuffer, int *const pos) { + int length = 0; + if (hasMultipleChars(flags)) { + length = ByteArrayUtils::readStringAndAdvancePosition(buffer, maxLength, outBuffer, + pos); + } else { + const int codePoint = getCodePointAndAdvancePosition(buffer, pos); + if (codePoint == NOT_A_CODE_POINT) { + // CAVEAT: codePoint == NOT_A_CODE_POINT means the code point is + // CHARACTER_ARRAY_TERMINATOR. The code point must not be CHARACTER_ARRAY_TERMINATOR + // when the PtNode has a single code point. + length = 0; + AKLOGE("codePoint is NOT_A_CODE_POINT. pos: %d, codePoint: 0x%x, buffer[pos - 1]: 0x%x", + *pos - 1, codePoint, buffer[*pos - 1]); + ASSERT(false); + } else if (maxLength > 0) { + outBuffer[0] = codePoint; + length = 1; + } + } + return length; +} + +// Returns the number of skipped characters. +/* static */ int PtReadingUtils::skipCharacters(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, int *const pos) { + if (hasMultipleChars(flags)) { + return ByteArrayUtils::advancePositionToBehindString(buffer, maxLength, pos); + } else { + if (maxLength > 0) { + getCodePointAndAdvancePosition(buffer, pos); + return 1; + } else { + return 0; + } + } +} + +/* static */ int PtReadingUtils::readProbabilityAndAdvancePosition(const uint8_t *const buffer, + int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); +} + +/* static */ int PtReadingUtils::readChildrenPositionAndAdvancePosition( + const uint8_t *const buffer, const NodeFlags flags, int *const pos) { + const int base = *pos; + int offset = 0; + switch (MASK_CHILDREN_POSITION_TYPE & flags) { + case FLAG_CHILDREN_POSITION_TYPE_ONEBYTE: + offset = ByteArrayUtils::readUint8AndAdvancePosition(buffer, pos); + break; + case FLAG_CHILDREN_POSITION_TYPE_TWOBYTES: + offset = ByteArrayUtils::readUint16AndAdvancePosition(buffer, pos); + break; + case FLAG_CHILDREN_POSITION_TYPE_THREEBYTES: + offset = ByteArrayUtils::readUint24AndAdvancePosition(buffer, pos); + break; + default: + // If we come here, it means we asked for the children of a word with + // no children. + return NOT_A_DICT_POS; + } + return base + offset; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h new file mode 100644 index 000000000..8420ee95a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/patricia_trie_reading_utils.h @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PATRICIA_TRIE_READING_UTILS_H +#define LATINIME_PATRICIA_TRIE_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class PatriciaTrieReadingUtils { + public: + typedef uint8_t NodeFlags; + + static int getPtNodeArraySizeAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + static NodeFlags getFlagsAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + static int getCodePointAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + // Returns the number of read characters. + static int getCharsAndAdvancePosition(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, int *const outBuffer, int *const pos); + + // Returns the number of skipped characters. + static int skipCharacters(const uint8_t *const buffer, const NodeFlags flags, + const int maxLength, int *const pos); + + static int readProbabilityAndAdvancePosition(const uint8_t *const buffer, int *const pos); + + static int readChildrenPositionAndAdvancePosition(const uint8_t *const buffer, + const NodeFlags flags, int *const pos); + + /** + * Node Flags + */ + static AK_FORCE_INLINE bool isBlacklisted(const NodeFlags flags) { + return (flags & FLAG_IS_BLACKLISTED) != 0; + } + + static AK_FORCE_INLINE bool isNotAWord(const NodeFlags flags) { + return (flags & FLAG_IS_NOT_A_WORD) != 0; + } + + static AK_FORCE_INLINE bool isTerminal(const NodeFlags flags) { + return (flags & FLAG_IS_TERMINAL) != 0; + } + + static AK_FORCE_INLINE bool hasShortcutTargets(const NodeFlags flags) { + return (flags & FLAG_HAS_SHORTCUT_TARGETS) != 0; + } + + static AK_FORCE_INLINE bool hasBigrams(const NodeFlags flags) { + return (flags & FLAG_HAS_BIGRAMS) != 0; + } + + static AK_FORCE_INLINE bool hasMultipleChars(const NodeFlags flags) { + return (flags & FLAG_HAS_MULTIPLE_CHARS) != 0; + } + + static AK_FORCE_INLINE bool hasChildrenInFlags(const NodeFlags flags) { + return FLAG_CHILDREN_POSITION_TYPE_NOPOSITION != (MASK_CHILDREN_POSITION_TYPE & flags); + } + + static AK_FORCE_INLINE NodeFlags createAndGetFlags(const bool isBlacklisted, + const bool isNotAWord, const bool isTerminal, const bool hasShortcutTargets, + const bool hasBigrams, const bool hasMultipleChars, + const int childrenPositionFieldSize) { + NodeFlags nodeFlags = 0; + nodeFlags = isBlacklisted ? (nodeFlags | FLAG_IS_BLACKLISTED) : nodeFlags; + nodeFlags = isNotAWord ? (nodeFlags | FLAG_IS_NOT_A_WORD) : nodeFlags; + nodeFlags = isTerminal ? (nodeFlags | FLAG_IS_TERMINAL) : nodeFlags; + nodeFlags = hasShortcutTargets ? (nodeFlags | FLAG_HAS_SHORTCUT_TARGETS) : nodeFlags; + nodeFlags = hasBigrams ? (nodeFlags | FLAG_HAS_BIGRAMS) : nodeFlags; + nodeFlags = hasMultipleChars ? (nodeFlags | FLAG_HAS_MULTIPLE_CHARS) : nodeFlags; + if (childrenPositionFieldSize == 1) { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; + } else if (childrenPositionFieldSize == 2) { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; + } else if (childrenPositionFieldSize == 3) { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; + } else { + nodeFlags |= FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; + } + return nodeFlags; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(PatriciaTrieReadingUtils); + + static const NodeFlags MASK_CHILDREN_POSITION_TYPE; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_NOPOSITION; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_ONEBYTE; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_TWOBYTES; + static const NodeFlags FLAG_CHILDREN_POSITION_TYPE_THREEBYTES; + + static const NodeFlags FLAG_HAS_MULTIPLE_CHARS; + static const NodeFlags FLAG_IS_TERMINAL; + static const NodeFlags FLAG_HAS_SHORTCUT_TARGETS; + static const NodeFlags FLAG_HAS_BIGRAMS; + static const NodeFlags FLAG_IS_NOT_A_WORD; + static const NodeFlags FLAG_IS_BLACKLISTED; +}; +} // namespace latinime +#endif /* LATINIME_PATRICIA_TRIE_NODE_READING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h new file mode 100644 index 000000000..bd3211f6a --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/dynamic_shortcut_list_policy.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H +#define LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +/* + * This is a dynamic version of ShortcutListPolicy and supports an additional buffer. + */ +class DynamicShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + explicit DynamicShortcutListPolicy(const BufferWithExtendableBuffer *const buffer) + : mBuffer(buffer) {} + + ~DynamicShortcutListPolicy() {} + + int getStartPos(const int pos) const { + if (pos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + return pos + ShortcutListReadingUtils::getShortcutListSizeFieldSize(); + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + const ShortcutListReadingUtils::ShortcutFlags flags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(buffer, pos); + if (outHasNext) { + *outHasNext = ShortcutListReadingUtils::hasNext(flags); + } + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); + } + if (outCodePoint) { + *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( + buffer, maxCodePointCount, outCodePoint, pos); + } + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + void skipAllShortcuts(int *const pos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*pos); + const uint8_t *const buffer = mBuffer->getBuffer(usesAdditionalBuffer); + if (usesAdditionalBuffer) { + *pos -= mBuffer->getOriginalBufferSize(); + } + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(buffer, pos); + *pos += shortcutListSize; + if (usesAdditionalBuffer) { + *pos += mBuffer->getOriginalBufferSize(); + } + } + + // Copy shortcuts from the shortcut list that starts at fromPos in mBuffer to toPos in + // bufferToWrite and advance these positions after the shortcut lists. This returns whether + // the copy was succeeded or not. + bool copyAllShortcutsAndReturnIfSucceededOrNot(BufferWithExtendableBuffer *const bufferToWrite, + int *const fromPos, int *const toPos) const { + const bool usesAdditionalBuffer = mBuffer->isInAdditionalBuffer(*fromPos); + if (usesAdditionalBuffer) { + *fromPos -= mBuffer->getOriginalBufferSize(); + } + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(mBuffer->getBuffer(usesAdditionalBuffer), + fromPos); + // Copy shortcut list size. + if (!bufferToWrite->writeUintAndAdvancePosition( + shortcutListSize + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), + ShortcutListReadingUtils::getShortcutListSizeFieldSize(), toPos)) { + return false; + } + // Copy shortcut list. + for (int i = 0; i < shortcutListSize; ++i) { + const uint8_t data = ByteArrayUtils::readUint8AndAdvancePosition( + mBuffer->getBuffer(usesAdditionalBuffer), fromPos); + if (!bufferToWrite->writeUintAndAdvancePosition(data, 1 /* size */, toPos)) { + return false; + } + } + if (usesAdditionalBuffer) { + *fromPos += mBuffer->getOriginalBufferSize(); + } + return true; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DynamicShortcutListPolicy); + + const BufferWithExtendableBuffer *const mBuffer; +}; +} // namespace latinime +#endif // LATINIME_DYNAMIC_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h new file mode 100644 index 000000000..d73f73953 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_policy.h @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_POLICY_H +#define LATINIME_SHORTCUT_LIST_POLICY_H + +#include <stdint.h> + +#include "defines.h" +#include "suggest/core/policy/dictionary_shortcuts_structure_policy.h" +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" + +namespace latinime { + +class ShortcutListPolicy : public DictionaryShortcutsStructurePolicy { + public: + explicit ShortcutListPolicy(const uint8_t *const shortcutBuf) + : mShortcutsBuf(shortcutBuf) {} + + ~ShortcutListPolicy() {} + + int getStartPos(const int pos) const { + if (pos == NOT_A_DICT_POS) { + return NOT_A_DICT_POS; + } + int listPos = pos; + ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer(mShortcutsBuf, &listPos); + return listPos; + } + + void getNextShortcut(const int maxCodePointCount, int *const outCodePoint, + int *const outCodePointCount, bool *const outIsWhitelist, bool *const outHasNext, + int *const pos) const { + const ShortcutListReadingUtils::ShortcutFlags flags = + ShortcutListReadingUtils::getFlagsAndForwardPointer(mShortcutsBuf, pos); + if (outHasNext) { + *outHasNext = ShortcutListReadingUtils::hasNext(flags); + } + if (outIsWhitelist) { + *outIsWhitelist = ShortcutListReadingUtils::isWhitelist(flags); + } + if (outCodePoint) { + *outCodePointCount = ShortcutListReadingUtils::readShortcutTarget( + mShortcutsBuf, maxCodePointCount, outCodePoint, pos); + } + } + + void skipAllShortcuts(int *const pos) const { + const int shortcutListSize = ShortcutListReadingUtils + ::getShortcutListSizeAndForwardPointer(mShortcutsBuf, pos); + *pos += shortcutListSize; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListPolicy); + + const uint8_t *const mShortcutsBuf; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_POLICY_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp new file mode 100644 index 000000000..847dcdee5 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h" + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// Flag for presence of more attributes +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::FLAG_ATTRIBUTE_HAS_NEXT = 0x80; +// Mask for attribute probability, stored on 4 bits inside the flags byte. +const ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::MASK_ATTRIBUTE_PROBABILITY = 0x0F; +const int ShortcutListReadingUtils::SHORTCUT_LIST_SIZE_FIELD_SIZE = 2; +// The numeric value of the shortcut probability that means 'whitelist'. +const int ShortcutListReadingUtils::WHITELIST_SHORTCUT_PROBABILITY = 15; + +/* static */ ShortcutListReadingUtils::ShortcutFlags + ShortcutListReadingUtils::getFlagsAndForwardPointer(const uint8_t *const dictRoot, + int *const pos) { + return ByteArrayUtils::readUint8AndAdvancePosition(dictRoot, pos); +} + +/* static */ int ShortcutListReadingUtils::getShortcutListSizeAndForwardPointer( + const uint8_t *const dictRoot, int *const pos) { + // readUint16andAdvancePosition() returns an offset *including* the uint16 field itself. + return ByteArrayUtils::readUint16AndAdvancePosition(dictRoot, pos) + - SHORTCUT_LIST_SIZE_FIELD_SIZE; +} + +/* static */ int ShortcutListReadingUtils::readShortcutTarget( + const uint8_t *const dictRoot, const int maxLength, int *const outWord, int *const pos) { + return ByteArrayUtils::readStringAndAdvancePosition(dictRoot, maxLength, outWord, pos); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h new file mode 100644 index 000000000..a83ed5a50 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/shortcut/shortcut_list_reading_utils.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SHORTCUT_LIST_READING_UTILS_H +#define LATINIME_SHORTCUT_LIST_READING_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ShortcutListReadingUtils { + public: + typedef uint8_t ShortcutFlags; + + static ShortcutFlags getFlagsAndForwardPointer(const uint8_t *const dictRoot, int *const pos); + + static AK_FORCE_INLINE int getProbabilityFromFlags(const ShortcutFlags flags) { + return flags & MASK_ATTRIBUTE_PROBABILITY; + } + + static AK_FORCE_INLINE bool hasNext(const ShortcutFlags flags) { + return (flags & FLAG_ATTRIBUTE_HAS_NEXT) != 0; + } + + // This method returns the size of the shortcut list region excluding the shortcut list size + // field at the beginning. + static int getShortcutListSizeAndForwardPointer(const uint8_t *const dictRoot, int *const pos); + + static AK_FORCE_INLINE int getShortcutListSizeFieldSize() { + return SHORTCUT_LIST_SIZE_FIELD_SIZE; + } + + static AK_FORCE_INLINE void skipShortcuts(const uint8_t *const dictRoot, int *const pos) { + const int shortcutListSize = getShortcutListSizeAndForwardPointer(dictRoot, pos); + *pos += shortcutListSize; + } + + static AK_FORCE_INLINE bool isWhitelist(const ShortcutFlags flags) { + return getProbabilityFromFlags(flags) == WHITELIST_SHORTCUT_PROBABILITY; + } + + static int readShortcutTarget(const uint8_t *const dictRoot, const int maxLength, + int *const outWord, int *const pos); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ShortcutListReadingUtils); + + static const ShortcutFlags FLAG_ATTRIBUTE_HAS_NEXT; + static const ShortcutFlags MASK_ATTRIBUTE_PROBABILITY; + static const int SHORTCUT_LIST_SIZE_FIELD_SIZE; + static const int WHITELIST_SHORTCUT_PROBABILITY; +}; +} // namespace latinime +#endif // LATINIME_SHORTCUT_LIST_READING_UTILS_H diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp new file mode 100644 index 000000000..f692882f2 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t BufferWithExtendableBuffer::MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; +// TODO: Needs to allocate larger memory corresponding to the current vector size. +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; + +bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, + int *const pos) { + if (!(size >= 1 && size <= 4)) { + AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); + ASSERT(false); + return false; + } + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer; + if (usesAdditionalBuffer) { + *pos -= mOriginalBufferSize; + } + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBufferSize; + } + return true; +} + +bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints, + const int codePointCount, const bool writesTerminator ,int *const pos) { + const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints( + codePoints, codePointCount, writesTerminator); + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = usesAdditionalBuffer ? &mAdditionalBuffer[0] : mOriginalBuffer; + if (usesAdditionalBuffer) { + *pos -= mOriginalBufferSize; + } + ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount, + writesTerminator, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBufferSize; + } + return true; +} + +bool BufferWithExtendableBuffer::extendBuffer() { + const size_t sizeAfterExtending = + mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + if (sizeAfterExtending > mMaxAdditionalBufferSize) { + return false; + } + mAdditionalBuffer.resize(mAdditionalBuffer.size() + EXTEND_ADDITIONAL_BUFFER_SIZE_STEP); + return true; +} + +bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) { + if (isInAdditionalBuffer(pos)) { + const int tailPosition = getTailPosition(); + if (pos == tailPosition) { + // Append data to the tail. + if (pos + size > static_cast<int>(mAdditionalBuffer.size()) + mOriginalBufferSize) { + // Need to extend buffer. + if (!extendBuffer()) { + return false; + } + } + mUsedAdditionalBufferSize += size; + } else if (pos + size > tailPosition) { + // The access will beyond the tail of used region. + return false; + } + } else { + if (pos < 0 || mOriginalBufferSize < pos + size) { + // Invalid position or violate the boundary. + return false; + } + } + return true; +} + +} diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 000000000..17d2e39c2 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H +#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H + +#include <cstddef> +#include <stdint.h> +#include <vector> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +// This is used as a buffer that can be extended for updatable dictionaries. +// To optimize performance, raw pointer is directly used for reading buffer. The position has to be +// adjusted to access additional buffer. On the other hand, this class does not provide writable +// raw pointer but provides several methods that handle boundary checking for writing data. +class BufferWithExtendableBuffer { + public: + BufferWithExtendableBuffer(uint8_t *const originalBuffer, const int originalBufferSize, + const int maxAdditionalBufferSize = MAX_ADDITIONAL_BUFFER_SIZE) + : mOriginalBuffer(originalBuffer), mOriginalBufferSize(originalBufferSize), + mAdditionalBuffer(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + AK_FORCE_INLINE int getTailPosition() const { + return mOriginalBufferSize + mUsedAdditionalBufferSize; + } + + /** + * For reading. + */ + AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { + return position >= mOriginalBufferSize; + } + + // TODO: Resolve the issue that the address can be changed when the vector is resized. + // CAVEAT!: Be careful about array out of bound access with buffers + AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { + if (usesAdditionalBuffer) { + return &mAdditionalBuffer[0]; + } else { + return mOriginalBuffer; + } + } + + AK_FORCE_INLINE int getOriginalBufferSize() const { + return mOriginalBufferSize; + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize + * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100); + } + + /** + * For writing. + * + * Writing is allowed for original buffer, already written region of additional buffer and the + * tail of additional buffer. + */ + bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos); + + bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, + const bool writesTerminator, int *const pos); + + private: + DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); + + static const size_t MAX_ADDITIONAL_BUFFER_SIZE; + static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; + static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + + uint8_t *const mOriginalBuffer; + const int mOriginalBufferSize; + std::vector<uint8_t> mAdditionalBuffer; + int mUsedAdditionalBufferSize; + const size_t mMaxAdditionalBufferSize; + + // Return if the buffer is successfully extended or not. + bool extendBuffer(); + + // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of + // the additional buffer, try extending the buffer. + bool checkAndPrepareWriting(const int pos, const int size); +}; +} +#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp new file mode 100644 index 000000000..1833e8832 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; +const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; +const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h new file mode 100644 index 000000000..0c1576818 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/byte_array_utils.h @@ -0,0 +1,261 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BYTE_ARRAY_UTILS_H +#define LATINIME_BYTE_ARRAY_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +/** + * Utility methods for reading byte arrays. + */ +class ByteArrayUtils { + public: + /** + * Integer writing + * + * Each method write a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, + const uint32_t data, const int size, int *const pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); + return; + case 2: + ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); + return; + case 3: + ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); + return; + case 4: + ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); + return; + default: + break; + } + } + + /** + * Integer reading + * + * Each method read a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) + ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; + } + + static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; + } + + static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 8) ^ buffer[pos + 1]; + } + + static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { + return buffer[pos]; + } + + static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint32(buffer, *pos); + *pos += 4; + return value; + } + + static AK_FORCE_INLINE int readSint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t value = readUint8(buffer, *pos); + if (value < 0x80) { + return readUint24AndAdvancePosition(buffer, pos); + } else { + (*pos)++; + return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); + } + } + + static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint24(buffer, *pos); + *pos += 3; + return value; + } + + static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint16_t value = readUint16(buffer, *pos); + *pos += 2; + return value; + } + + static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return buffer[(*pos)++]; + } + + /** + * Code Point Reading + * + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + */ + static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { + int p = pos; + return readCodePointAndAdvancePosition(buffer, &p); + } + + static AK_FORCE_INLINE int readCodePointAndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t firstByte = readUint8(buffer, *pos); + if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { + if (firstByte == CHARACTER_ARRAY_TERMINATOR) { + *pos += 1; + return NOT_A_CODE_POINT; + } else { + return readUint24AndAdvancePosition(buffer, pos); + } + } else { + *pos += 1; + return firstByte; + } + } + + /** + * String (array of code points) Reading + * + * Reads code points until the terminator is found. + */ + // Returns the length of the string. + static int readStringAndAdvancePosition(const uint8_t *const buffer, + const int maxLength, int *const outBuffer, int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + outBuffer[length++] = codePoint; + codePoint = readCodePointAndAdvancePosition(buffer, pos); + } + return length; + } + + // Advances the position and returns the length of the string. + static int advancePositionToBehindString( + const uint8_t *const buffer, const int maxLength, int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + codePoint = readCodePointAndAdvancePosition(buffer, pos); + length++; + } + return length; + } + + /** + * String (array of code points) Writing + */ + static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, + const int *const codePoints, const int codePointCount, const bool writesTerminator, + int *const pos) { + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + writeUint24AndAdvancePosition(buffer, codePoint, pos); + } else { + // one byte character. + writeUint8AndAdvancePosition(buffer, codePoint, pos); + } + } + if (writesTerminator) { + writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); + } + } + + static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, + const int codePointCount, const bool writesTerminator) { + int byteCount = 0; + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + byteCount += 3; + } else { + // one byte character. + byteCount += 1; + } + } + if (writesTerminator) { + // The terminator is one byte. + byteCount += 1; + } + return byteCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); + + static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t CHARACTER_ARRAY_TERMINATOR; + + static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 24) & 0xFF; + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, + const uint16_t data, int *const pos) { + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, + const uint8_t data, int *const pos) { + buffer[(*pos)++] = data & 0xFF; + } +}; +} // namespace latinime +#endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp new file mode 100644 index 000000000..2e4ec2e1d --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" + +#include <cstdio> +#include <cstring> + +#include "suggest/policyimpl/dictionary/header/header_policy.h" +#include "suggest/policyimpl/dictionary/dynamic_patricia_trie_writing_utils.h" +#include "suggest/policyimpl/dictionary/utils/buffer_with_extendable_buffer.h" +#include "suggest/policyimpl/dictionary/utils/format_utils.h" + +namespace latinime { + +const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp"; + +/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath, + const int dictVersion, const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + switch (dictVersion) { + case 3: + return createEmptyV3DictFile(filePath, attributeMap); + default: + // Only version 3 dictionary is supported for now. + return false; + } +} + +/* static */ bool DictFileWritingUtils::createEmptyV3DictFile(const char *const filePath, + const HeaderReadWriteUtils::AttributeMap *const attributeMap) { + BufferWithExtendableBuffer headerBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + HeaderPolicy headerPolicy(FormatUtils::VERSION_3, attributeMap); + headerPolicy.writeHeaderToBuffer(&headerBuffer, true /* updatesLastUpdatedTime */); + BufferWithExtendableBuffer bodyBuffer(0 /* originalBuffer */, 0 /* originalBufferSize */); + if (!DynamicPatriciaTrieWritingUtils::writeEmptyDictionary(&bodyBuffer, 0 /* rootPos */)) { + return false; + } + return flushAllHeaderAndBodyToFile(filePath, &headerBuffer, &bodyBuffer); +} + +/* static */ bool DictFileWritingUtils::flushAllHeaderAndBodyToFile(const char *const filePath, + BufferWithExtendableBuffer *const dictHeader, BufferWithExtendableBuffer *const dictBody) { + const int tmpFileNameBufSize = strlen(filePath) + + strlen(TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE) + 1 /* terminator */; + // Name of a temporary file used for writing that is a connected string of original name and + // TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE. + char tmpFileName[tmpFileNameBufSize]; + snprintf(tmpFileName, tmpFileNameBufSize, "%s%s", filePath, + TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE); + FILE *const file = fopen(tmpFileName, "wb"); + if (!file) { + AKLOGE("Dictionary file %s cannnot be opened.", tmpFileName); + ASSERT(false); + return false; + } + // Write the dictionary header. + if (!writeBufferToFile(file, dictHeader)) { + remove(tmpFileName); + AKLOGE("Dictionary header cannnot be written. size: %d", dictHeader->getTailPosition()); + ASSERT(false); + return false; + } + // Write the dictionary body. + if (!writeBufferToFile(file, dictBody)) { + remove(tmpFileName); + AKLOGE("Dictionary body cannnot be written. size: %d", dictBody->getTailPosition()); + ASSERT(false); + return false; + } + fclose(file); + rename(tmpFileName, filePath); + return true; +} + +// This closes file pointer when an error is caused and returns whether the writing was succeeded +// or not. +/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer) { + const int originalBufSize = buffer->getOriginalBufferSize(); + if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */), + originalBufSize, 1, file) < 1) { + fclose(file); + return false; + } + const int additionalBufSize = buffer->getTailPosition() - buffer->getOriginalBufferSize(); + if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */), + additionalBufSize, 1, file) < 1) { + fclose(file); + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h new file mode 100644 index 000000000..bd4ac66fd --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H +#define LATINIME_DICT_FILE_WRITING_UTILS_H + +#include <cstdio> + +#include "defines.h" +#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class DictFileWritingUtils { + public: + static bool createEmptyDictFile(const char *const filePath, const int dictVersion, + const HeaderReadWriteUtils::AttributeMap *const attributeMap); + + static bool flushAllHeaderAndBodyToFile(const char *const filePath, + BufferWithExtendableBuffer *const dictHeader, + BufferWithExtendableBuffer *const dictBody); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils); + + static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE; + + static bool createEmptyV3DictFile(const char *const filePath, + const HeaderReadWriteUtils::AttributeMap *const attributeMap); + + static bool writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer); +}; +} // namespace latinime +#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp new file mode 100644 index 000000000..1d77d5c27 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/policyimpl/dictionary/utils/format_utils.h" + +#include "suggest/policyimpl/dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; + +// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 +const int FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; + +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( + const uint8_t *const dict, const int dictSize) { + // The magic number is stored big-endian. + // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't + // understand this format. + if (dictSize < DICTIONARY_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + const uint32_t magicNumber = ByteArrayUtils::readUint32(dict, 0); + switch (magicNumber) { + case MAGIC_NUMBER: + // Version 2 header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Dictionary format version number (2 bytes) + // Options (2 bytes) + // Header size (4 bytes) : integer, big endian + if (ByteArrayUtils::readUint16(dict, 4) == 2) { + return VERSION_2; + } else if (ByteArrayUtils::readUint16(dict, 4) == 3) { + return VERSION_3; + } else { + return UNKNOWN_VERSION; + } + default: + return UNKNOWN_VERSION; + } +} + +} // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h new file mode 100644 index 000000000..79ed0de29 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/format_utils.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FORMAT_UTILS_H +#define LATINIME_FORMAT_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +/** + * Methods to handle binary dictionary format version. + */ +class FormatUtils { + public: + enum FORMAT_VERSION { + VERSION_2, + VERSION_3, + UNKNOWN_VERSION + }; + + // 32 bit magic number is stored at the beginning of the dictionary header to reject + // unsupported or obsolete dictionary formats. + static const uint32_t MAGIC_NUMBER; + + static FORMAT_VERSION detectFormatVersion(const uint8_t *const dict, const int dictSize); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); + + static const int DICTIONARY_MINIMUM_SIZE; +}; +} // namespace latinime +#endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h new file mode 100644 index 000000000..6b69116eb --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/mmapped_buffer.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_MMAPPED_BUFFER_H +#define LATINIME_MMAPPED_BUFFER_H + +#include <cerrno> +#include <fcntl.h> +#include <stdint.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "defines.h" + +namespace latinime { + +class MmappedBuffer { + public: + static MmappedBuffer* openBuffer(const char *const path, const int bufferOffset, + const int bufferSize, const bool isUpdatable) { + const int openMode = isUpdatable ? O_RDWR : O_RDONLY; + const int mmapFd = open(path, openMode); + if (mmapFd < 0) { + AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); + return 0; + } + const int pagesize = getpagesize(); + const int offset = bufferOffset % pagesize; + int alignedOffset = bufferOffset - offset; + int alignedSize = bufferSize + offset; + const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ; + void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, + alignedOffset); + if (mmappedBuffer == MAP_FAILED) { + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + close(mmapFd); + return 0; + } + uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset; + if (!buffer) { + AKLOGE("DICT: buffer is null"); + close(mmapFd); + return 0; + } + return new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, mmapFd, + isUpdatable); + } + + ~MmappedBuffer() { + int ret = munmap(mMmappedBuffer, mAlignedSize); + if (ret != 0) { + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + } + ret = close(mMmapFd); + if (ret != 0) { + AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); + } + } + + AK_FORCE_INLINE uint8_t *getBuffer() const { + return mBuffer; + } + + AK_FORCE_INLINE int getBufferSize() const { + return mBufferSize; + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + private: + AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize, + void *const mmappedBuffer, const int alignedSize, const int mmapFd, + const bool isUpdatable) + : mBuffer(buffer), mBufferSize(bufferSize), mMmappedBuffer(mmappedBuffer), + mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} + + DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); + + uint8_t *const mBuffer; + const int mBufferSize; + void *const mMmappedBuffer; + const int mAlignedSize; + const int mMmapFd; + const bool mIsUpdatable; +}; +} +#endif /* LATINIME_MMAPPED_BUFFER_H */ diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h new file mode 100644 index 000000000..21fe355b8 --- /dev/null +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/probability_utils.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include <stdint.h> + +#include "defines.h" + +namespace latinime { + +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictEncoder#makeBigramFlags for details. + const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp index f87989286..ecceb60d3 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp @@ -22,31 +22,36 @@ const float ScoringParams::MAX_SPATIAL_DISTANCE = 1.0f; const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY = 40; const int ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED = 120; const float ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD = 1.0f; -const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 125; +// TODO: Unlimit max cache dic node size +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE = 170; +const int ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT = 310; const int ScoringParams::THRESHOLD_SHORT_WORD_LENGTH = 4; const float ScoringParams::DISTANCE_WEIGHT_LENGTH = 0.132f; -const float ScoringParams::PROXIMITY_COST = 0.086f; -const float ScoringParams::FIRST_PROXIMITY_COST = 0.104f; +const float ScoringParams::PROXIMITY_COST = 0.095f; +const float ScoringParams::FIRST_CHAR_PROXIMITY_COST = 0.102f; +const float ScoringParams::FIRST_PROXIMITY_COST = 0.019f; const float ScoringParams::OMISSION_COST = 0.458f; const float ScoringParams::OMISSION_COST_SAME_CHAR = 0.491f; const float ScoringParams::OMISSION_COST_FIRST_CHAR = 0.582f; const float ScoringParams::INSERTION_COST = 0.730f; +const float ScoringParams::TERMINAL_INSERTION_COST = 0.93f; const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.586f; +const float ScoringParams::INSERTION_COST_PROXIMITY_CHAR = 0.70f; const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.623f; -const float ScoringParams::TRANSPOSITION_COST = 0.516f; +const float ScoringParams::TRANSPOSITION_COST = 0.526f; const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.319f; const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.380f; -const float ScoringParams::SUBSTITUTION_COST = 0.403f; +const float ScoringParams::SUBSTITUTION_COST = 0.383f; const float ScoringParams::COST_NEW_WORD = 0.042f; const float ScoringParams::COST_SECOND_OR_LATER_WORD_FIRST_CHAR_UPPERCASE = 0.25f; const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.123f; const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.545f; const float ScoringParams::COST_LOOKAHEAD = 0.073f; -const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.105f; -const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.038f; -const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.444f; +const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.093f; +const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.041f; +const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.447f; const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f; const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f; -const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.06f; +const float ScoringParams::NORMALIZED_SPATIAL_DISTANCE_THRESHOLD_FOR_EDIT = 0.045f; } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.h b/native/jni/src/suggest/policyimpl/typing/scoring_params.h index 53ac999c1..7d4b5c3c7 100644 --- a/native/jni/src/suggest/policyimpl/typing/scoring_params.h +++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.h @@ -29,6 +29,7 @@ class ScoringParams { static const int THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; static const float AUTOCORRECT_OUTPUT_THRESHOLD; static const int MAX_CACHE_DIC_NODE_SIZE; + static const int MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT; static const int THRESHOLD_SHORT_WORD_LENGTH; // Numerically optimized parameters (currently for tap typing only). @@ -36,12 +37,15 @@ class ScoringParams { // TODO: explore optimization of gesture parameters. static const float DISTANCE_WEIGHT_LENGTH; static const float PROXIMITY_COST; + static const float FIRST_CHAR_PROXIMITY_COST; static const float FIRST_PROXIMITY_COST; static const float OMISSION_COST; static const float OMISSION_COST_SAME_CHAR; static const float OMISSION_COST_FIRST_CHAR; static const float INSERTION_COST; + static const float TERMINAL_INSERTION_COST; static const float INSERTION_COST_SAME_CHAR; + static const float INSERTION_COST_PROXIMITY_CHAR; static const float INSERTION_COST_FIRST_CHAR; static const float TRANSPOSITION_COST; static const float SPACE_SUBSTITUTION_COST; diff --git a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h index 90e2133e7..56ffcc93e 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_scoring.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_scoring.h @@ -55,10 +55,10 @@ class TypingScoring : public Scoring { const int inputSize, const bool forceCommit) const { const float maxDistance = ScoringParams::DISTANCE_WEIGHT_LANGUAGE + static_cast<float>(inputSize) * ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT; - return static_cast<int>((ScoringParams::TYPING_BASE_OUTPUT_SCORE - - (compoundDistance / maxDistance) - + (forceCommit ? ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD : 0.0f)) - * SUGGEST_INTERFACE_OUTPUT_SCALE); + const float score = ScoringParams::TYPING_BASE_OUTPUT_SCORE + - compoundDistance / maxDistance + + (forceCommit ? ScoringParams::AUTOCORRECT_OUTPUT_THRESHOLD : 0.0f); + return static_cast<int>(score * SUGGEST_INTERFACE_OUTPUT_SCALE); } AK_FORCE_INLINE float getDoubleLetterDemotionDistanceCost(const int terminalIndex, diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h index 12110d54f..89e53f441 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h @@ -19,14 +19,15 @@ #include <stdint.h> -#include "char_utils.h" #include "defines.h" -#include "proximity_info_state.h" #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" +#include "suggest/core/layout/proximity_info_state.h" +#include "suggest/core/layout/proximity_info_utils.h" #include "suggest/core/policy/traversal.h" #include "suggest/core/session/dic_traverse_session.h" #include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" namespace latinime { class TypingTraversal : public Traversal { @@ -64,9 +65,9 @@ class TypingTraversal : public Traversal { } const int point0Index = dicNode->getInputIndex(0); const int currentBaseLowerCodePoint = - toBaseLowerCase(childDicNode->getNodeCodePoint()); + CharUtils::toBaseLowerCase(childDicNode->getNodeCodePoint()); const int typedBaseLowerCodePoint = - toBaseLowerCase(traverseSession->getProximityInfoState(0) + CharUtils::toBaseLowerCase(traverseSession->getProximityInfoState(0) ->getPrimaryCodePointAt(point0Index)); return (currentBaseLowerCodePoint != typedBaseLowerCodePoint); } @@ -136,7 +137,7 @@ class TypingTraversal : public Traversal { return ScoringParams::MAX_SPATIAL_DISTANCE; } - AK_FORCE_INLINE bool allowPartialCommit() const { + AK_FORCE_INLINE bool autoCorrectsToMultiWordSuggestionIfTop() const { return true; } @@ -147,11 +148,12 @@ class TypingTraversal : public Traversal { AK_FORCE_INLINE bool sameAsTyped( const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { return traverseSession->getProximityInfoState(0)->sameAsTyped( - dicNode->getOutputWordBuf(), dicNode->getDepth()); + dicNode->getOutputWordBuf(), dicNode->getNodeCodePointCount()); } - AK_FORCE_INLINE int getMaxCacheSize() const { - return ScoringParams::MAX_CACHE_DIC_NODE_SIZE; + AK_FORCE_INLINE int getMaxCacheSize(const int inputSize) const { + return (inputSize <= 1) ? ScoringParams::MAX_CACHE_DIC_NODE_SIZE_FOR_SINGLE_POINT + : ScoringParams::MAX_CACHE_DIC_NODE_SIZE; } AK_FORCE_INLINE bool isPossibleOmissionChildNode( @@ -159,7 +161,7 @@ class TypingTraversal : public Traversal { const DicNode *const dicNode) const { const ProximityType proximityType = getProximityType(traverseSession, parentDicNode, dicNode); - if (!DicNodeUtils::isProximityChar(proximityType)) { + if (!ProximityInfoUtils::isMatchOrProximityChar(proximityType)) { return false; } return true; @@ -171,8 +173,8 @@ class TypingTraversal : public Traversal { return false; } const int c = dicNode->getOutputWordBuf()[0]; - const bool shortCappedWord = dicNode->getDepth() - < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && isAsciiUpper(c); + const bool shortCappedWord = dicNode->getNodeCodePointCount() + < ScoringParams::THRESHOLD_SHORT_WORD_LENGTH && CharUtils::isAsciiUpper(c); return !shortCappedWord || probability >= ScoringParams::THRESHOLD_NEXT_WORD_PROBABILITY_FOR_CAPPED; } diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp index e4c69d1f6..408b12ae9 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.cpp @@ -44,6 +44,7 @@ ErrorType TypingWeighting::getErrorType(const CorrectionType correctionType, break; case CT_SUBSTITUTION: case CT_INSERTION: + case CT_TERMINAL_INSERTION: case CT_TRANSPOSITION: return ET_EDIT_CORRECTION; case CT_NEW_WORD_SPACE_OMITTION: diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h index 3938c0ec5..9f0a331e3 100644 --- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h +++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h @@ -18,11 +18,12 @@ #define LATINIME_TYPING_WEIGHTING_H #include "defines.h" -#include "suggest_utils.h" #include "suggest/core/dicnode/dic_node_utils.h" +#include "suggest/core/layout/touch_position_correction_utils.h" #include "suggest/core/policy/weighting.h" #include "suggest/core/session/dic_traverse_session.h" #include "suggest/policyimpl/typing/scoring_params.h" +#include "utils/char_utils.h" namespace latinime { @@ -54,7 +55,7 @@ class TypingWeighting : public Weighting { const bool isZeroCostOmission = parentDicNode->isZeroCostOmission(); const bool sameCodePoint = dicNode->isSameNodeCodePoint(parentDicNode); // If the traversal omitted the first letter then the dicNode should now be on the second. - const bool isFirstLetterOmission = dicNode->getDepth() == 2; + const bool isFirstLetterOmission = dicNode->getNodeCodePointCount() == 2; float cost = 0.0f; if (isZeroCostOmission) { cost = 0.0f; @@ -73,16 +74,20 @@ class TypingWeighting : public Weighting { // Note: min() required since length can be MAX_POINT_TO_KEY_LENGTH for characters not on // the keyboard (like accented letters) const float normalizedSquaredLength = traverseSession->getProximityInfoState(0) - ->getPointToKeyLength(pointIndex, dicNode->getNodeCodePoint()); - const float normalizedDistance = SuggestUtils::getSweetSpotFactor( + ->getPointToKeyLength(pointIndex, + CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint())); + const float normalizedDistance = TouchPositionCorrectionUtils::getSweetSpotFactor( traverseSession->isTouchPositionCorrectionEnabled(), normalizedSquaredLength); const float weightedDistance = ScoringParams::DISTANCE_WEIGHT_LENGTH * normalizedDistance; const bool isFirstChar = pointIndex == 0; const bool isProximity = isProximityDicNode(traverseSession, dicNode); - float cost = isProximity ? (isFirstChar ? ScoringParams::FIRST_PROXIMITY_COST + float cost = isProximity ? (isFirstChar ? ScoringParams::FIRST_CHAR_PROXIMITY_COST : ScoringParams::PROXIMITY_COST) : 0.0f; - if (dicNode->getDepth() == 2) { + if (isProximity && dicNode->getProximityCorrectionCount() == 0) { + cost += ScoringParams::FIRST_PROXIMITY_COST; + } + if (dicNode->getNodeCodePointCount() == 2) { // At the second character of the current word, we check if the first char is uppercase // and the word is a second or later word of a multiple word suggestion. We demote it // if so. @@ -98,9 +103,9 @@ class TypingWeighting : public Weighting { bool isProximityDicNode(const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const { const int pointIndex = dicNode->getInputIndex(0); - const int primaryCodePoint = toBaseLowerCase( + const int primaryCodePoint = CharUtils::toBaseLowerCase( traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(pointIndex)); - const int dicNodeChar = toBaseLowerCase(dicNode->getNodeCodePoint()); + const int dicNodeChar = CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint()); return primaryCodePoint != dicNodeChar; } @@ -109,10 +114,10 @@ class TypingWeighting : public Weighting { const int16_t parentPointIndex = parentDicNode->getInputIndex(0); const int prevCodePoint = parentDicNode->getNodeCodePoint(); const float distance1 = traverseSession->getProximityInfoState(0)->getPointToKeyLength( - parentPointIndex + 1, prevCodePoint); + parentPointIndex + 1, CharUtils::toBaseLowerCase(prevCodePoint)); const int codePoint = dicNode->getNodeCodePoint(); const float distance2 = traverseSession->getProximityInfoState(0)->getPointToKeyLength( - parentPointIndex, codePoint); + parentPointIndex, CharUtils::toBaseLowerCase(codePoint)); const float distance = distance1 + distance2; const float weightedLengthDistance = distance * ScoringParams::DISTANCE_WEIGHT_LENGTH; @@ -121,31 +126,38 @@ class TypingWeighting : public Weighting { float getInsertionCost(const DicTraverseSession *const traverseSession, const DicNode *const parentDicNode, const DicNode *const dicNode) const { - const int16_t parentPointIndex = parentDicNode->getInputIndex(0); - const int prevCodePoint = - traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt(parentPointIndex); - + const int16_t insertedPointIndex = parentDicNode->getInputIndex(0); + const int prevCodePoint = traverseSession->getProximityInfoState(0)->getPrimaryCodePointAt( + insertedPointIndex); const int currentCodePoint = dicNode->getNodeCodePoint(); const bool sameCodePoint = prevCodePoint == currentCodePoint; + const bool existsAdjacentProximityChars = traverseSession->getProximityInfoState(0) + ->existsAdjacentProximityChars(insertedPointIndex); const float dist = traverseSession->getProximityInfoState(0)->getPointToKeyLength( - parentPointIndex + 1, currentCodePoint); + insertedPointIndex + 1, CharUtils::toBaseLowerCase(dicNode->getNodeCodePoint())); const float weightedDistance = dist * ScoringParams::DISTANCE_WEIGHT_LENGTH; - const bool singleChar = dicNode->getDepth() == 1; - const float cost = (singleChar ? ScoringParams::INSERTION_COST_FIRST_CHAR : 0.0f) - + (sameCodePoint ? ScoringParams::INSERTION_COST_SAME_CHAR - : ScoringParams::INSERTION_COST); + const bool singleChar = dicNode->getNodeCodePointCount() == 1; + float cost = (singleChar ? ScoringParams::INSERTION_COST_FIRST_CHAR : 0.0f); + if (sameCodePoint) { + cost += ScoringParams::INSERTION_COST_SAME_CHAR; + } else if (existsAdjacentProximityChars) { + cost += ScoringParams::INSERTION_COST_PROXIMITY_CHAR; + } else { + cost += ScoringParams::INSERTION_COST; + } return cost + weightedDistance; } - float getNewWordCost(const DicTraverseSession *const traverseSession, - const DicNode *const dicNode) const { + float getNewWordSpatialCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode, DicNode_InputStateG *inputStateG) const { return ScoringParams::COST_NEW_WORD * traverseSession->getMultiWordCostMultiplier(); } - float getNewWordBigramCost(const DicTraverseSession *const traverseSession, + float getNewWordBigramLanguageCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, MultiBigramMap *const multiBigramMap) const { - return DicNodeUtils::getBigramNodeImprobability(traverseSession->getOffsetDict(), + return DicNodeUtils::getBigramNodeImprobability( + traverseSession->getDictionaryStructurePolicy(), dicNode, multiBigramMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; } @@ -162,9 +174,16 @@ class TypingWeighting : public Weighting { float getTerminalLanguageCost(const DicTraverseSession *const traverseSession, const DicNode *const dicNode, const float dicNodeLanguageImprobability) const { - const float languageImprobability = (dicNode->isExactMatch()) ? - 0.0f : dicNodeLanguageImprobability; - return languageImprobability * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; + return dicNodeLanguageImprobability * ScoringParams::DISTANCE_WEIGHT_LANGUAGE; + } + + float getTerminalInsertionCost(const DicTraverseSession *const traverseSession, + const DicNode *const dicNode) const { + const int inputIndex = dicNode->getInputIndex(0); + const int inputSize = traverseSession->getInputSize(); + ASSERT(inputIndex < inputSize); + // TODO: Implement more efficient logic + return ScoringParams::TERMINAL_INSERTION_COST * (inputSize - inputIndex); } AK_FORCE_INLINE bool needsToNormalizeCompoundDistance() const { diff --git a/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h b/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h index ec1457455..81614bc9c 100644 --- a/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h +++ b/native/jni/src/suggest/policyimpl/utils/damerau_levenshtein_edit_distance_policy.h @@ -17,8 +17,8 @@ #ifndef LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H #define LATINIME_DAEMARU_LEVENSHTEIN_EDIT_DISTANCE_POLICY_H -#include "char_utils.h" #include "suggest/policyimpl/utils/edit_distance_policy.h" +#include "utils/char_utils.h" namespace latinime { @@ -31,8 +31,8 @@ class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy { ~DamerauLevenshteinEditDistancePolicy() {} AK_FORCE_INLINE float getSubstitutionCost(const int index0, const int index1) const { - const int c0 = toBaseLowerCase(mString0[index0]); - const int c1 = toBaseLowerCase(mString1[index1]); + const int c0 = CharUtils::toBaseLowerCase(mString0[index0]); + const int c1 = CharUtils::toBaseLowerCase(mString1[index1]); return (c0 == c1) ? 0.0f : 1.0f; } @@ -45,10 +45,10 @@ class DamerauLevenshteinEditDistancePolicy : public EditDistancePolicy { } AK_FORCE_INLINE bool allowTransposition(const int index0, const int index1) const { - const int c0 = toBaseLowerCase(mString0[index0]); - const int c1 = toBaseLowerCase(mString1[index1]); - if (index0 > 0 && index1 > 0 && c0 == toBaseLowerCase(mString1[index1 - 1]) - && c1 == toBaseLowerCase(mString0[index0 - 1])) { + const int c0 = CharUtils::toBaseLowerCase(mString0[index0]); + const int c1 = CharUtils::toBaseLowerCase(mString1[index1]); + if (index0 > 0 && index1 > 0 && c0 == CharUtils::toBaseLowerCase(mString1[index1 - 1]) + && c1 == CharUtils::toBaseLowerCase(mString0[index0 - 1])) { return true; } return false; diff --git a/native/jni/src/suggest/policyimpl/utils/edit_distance.h b/native/jni/src/suggest/policyimpl/utils/edit_distance.h index cbbd66894..0871c37ce 100644 --- a/native/jni/src/suggest/policyimpl/utils/edit_distance.h +++ b/native/jni/src/suggest/policyimpl/utils/edit_distance.h @@ -62,6 +62,26 @@ class EditDistance { return dp[(beforeLength + 1) * (afterLength + 1) - 1]; } + AK_FORCE_INLINE static void dumpEditDistance10ForDebug(const float *const editDistanceTable, + const int editDistanceTableWidth, const int outputLength) { + if (DEBUG_DICT) { + AKLOGI("EditDistanceTable"); + for (int i = 0; i <= 10; ++i) { + float c[11]; + for (int j = 0; j <= 10; ++j) { + if (j < editDistanceTableWidth + 1 && i < outputLength + 1) { + c[j] = (editDistanceTable + i * (editDistanceTableWidth + 1))[j]; + } else { + c[j] = -1.0f; + } + } + AKLOGI("[ %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f ]", + c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7], c[8], c[9], c[10]); + (void)c; // To suppress compiler warning + } + } + } + private: DISALLOW_IMPLICIT_CONSTRUCTORS(EditDistance); }; |