diff options
Diffstat (limited to 'native/jni/src/dictionary/utils')
26 files changed, 3286 insertions, 0 deletions
diff --git a/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h new file mode 100644 index 000000000..8a614730b --- /dev/null +++ b/native/jni/src/dictionary/utils/binary_dictionary_bigrams_iterator.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H + +#include "defines.h" +#include "dictionary/interface/dictionary_bigrams_structure_policy.h" + +namespace latinime { + +class BinaryDictionaryBigramsIterator { + public: + // Empty iterator. + BinaryDictionaryBigramsIterator() + : mBigramsStructurePolicy(nullptr), mPos(NOT_A_DICT_POS), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), mHasNext(false) {} + + BinaryDictionaryBigramsIterator( + const DictionaryBigramsStructurePolicy *const bigramsStructurePolicy, const int pos) + : mBigramsStructurePolicy(bigramsStructurePolicy), mPos(pos), + mBigramPos(NOT_A_DICT_POS), mProbability(NOT_A_PROBABILITY), + mHasNext(pos != NOT_A_DICT_POS) {} + + BinaryDictionaryBigramsIterator(BinaryDictionaryBigramsIterator &&bigramsIterator) + : mBigramsStructurePolicy(bigramsIterator.mBigramsStructurePolicy), + mPos(bigramsIterator.mPos), mBigramPos(bigramsIterator.mBigramPos), + mProbability(bigramsIterator.mProbability), mHasNext(bigramsIterator.mHasNext) {} + + AK_FORCE_INLINE bool hasNext() const { + return mHasNext; + } + + AK_FORCE_INLINE void next() { + mBigramsStructurePolicy->getNextBigram(&mBigramPos, &mProbability, &mHasNext, &mPos); + } + + AK_FORCE_INLINE int getProbability() const { + return mProbability; + } + + AK_FORCE_INLINE int getBigramPos() const { + return mBigramPos; + } + + private: + DISALLOW_COPY_AND_ASSIGN(BinaryDictionaryBigramsIterator); + + const DictionaryBigramsStructurePolicy *const mBigramsStructurePolicy; + int mPos; + int mBigramPos; + int mProbability; + bool mHasNext; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_BIGRAMS_ITERATOR_H diff --git a/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h new file mode 100644 index 000000000..a4ddd58c2 --- /dev/null +++ b/native/jni/src/dictionary/utils/binary_dictionary_shortcut_iterator.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H +#define LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H + +#include "defines.h" +#include "dictionary/interface/dictionary_shortcuts_structure_policy.h" + +namespace latinime { + +class BinaryDictionaryShortcutIterator { + public: + BinaryDictionaryShortcutIterator( + const DictionaryShortcutsStructurePolicy *const shortcutStructurePolicy, + const int shortcutPos) + : mShortcutStructurePolicy(shortcutStructurePolicy), + mPos(shortcutStructurePolicy->getStartPos(shortcutPos)), + mHasNextShortcutTarget(shortcutPos != NOT_A_DICT_POS) {} + + BinaryDictionaryShortcutIterator(const BinaryDictionaryShortcutIterator &&shortcutIterator) + : mShortcutStructurePolicy(shortcutIterator.mShortcutStructurePolicy), + mPos(shortcutIterator.mPos), + mHasNextShortcutTarget(shortcutIterator.mHasNextShortcutTarget) {} + + AK_FORCE_INLINE bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as an int string and put it to outTarget, put its length + // to outTargetLength, put whether it is whitelist to outIsWhitelist. + AK_FORCE_INLINE void nextShortcutTarget( + const int maxDepth, int *const outTarget, int *const outTargetLength, + bool *const outIsWhitelist) { + mShortcutStructurePolicy->getNextShortcut(maxDepth, outTarget, outTargetLength, + outIsWhitelist, &mHasNextShortcutTarget, &mPos); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(BinaryDictionaryShortcutIterator); + DISALLOW_ASSIGNMENT_OPERATOR(BinaryDictionaryShortcutIterator); + + const DictionaryShortcutsStructurePolicy *const mShortcutStructurePolicy; + int mPos; + bool mHasNextShortcutTarget; +}; +} // namespace latinime +#endif // LATINIME_BINARY_DICTIONARY_SHORTCUT_ITERATOR_H diff --git a/native/jni/src/dictionary/utils/bloom_filter.h b/native/jni/src/dictionary/utils/bloom_filter.h new file mode 100644 index 000000000..1e60f49ed --- /dev/null +++ b/native/jni/src/dictionary/utils/bloom_filter.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BLOOM_FILTER_H +#define LATINIME_BLOOM_FILTER_H + +#include <bitset> + +#include "defines.h" + +namespace latinime { + +// This bloom filter is used for optimizing bigram retrieval. +// Execution times with previous word "this" are as follows: +// without bloom filter (use only hash_map): +// Total 147792.34 (sum of others 147771.57) +// with bloom filter: +// Total 145900.64 (sum of others 145874.30) +// always read binary dictionary: +// Total 148603.14 (sum of others 148579.90) +class BloomFilter { + public: + BloomFilter() : mFilter() {} + + AK_FORCE_INLINE void setInFilter(const int position) { + mFilter.set(getIndex(position)); + } + + AK_FORCE_INLINE bool isInFilter(const int position) const { + return mFilter.test(getIndex(position)); + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter); + + AK_FORCE_INLINE size_t getIndex(const int position) const { + return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO; + } + + // Size, in bits, of the bloom filter index for bigrams + // The probability of false positive is (1 - e ** (-kn/m))**k, + // where k is the number of hash functions, n the number of bigrams, and m the number of + // bits we can test. + // At the moment 100 is the maximum number of bigrams for a word with the current main + // dictionaries, so n = 100. 1024 buckets give us m = 1024. + // With 1 hash function, our false positive rate is about 9.3%, which should be enough for + // our uses since we are only using this to increase average performance. For the record, + // k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%, + // and m = 4096 gives 2.4%. + // This is assigned here because it is used for bitset size. + // 1021 is the largest prime under 1024. + static const size_t BIGRAM_FILTER_MODULO = 1021; + std::bitset<BIGRAM_FILTER_MODULO> mFilter; +}; +} // namespace latinime +#endif // LATINIME_BLOOM_FILTER_H diff --git a/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp new file mode 100644 index 000000000..217569651 --- /dev/null +++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +const size_t BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE = 1024 * 1024; +const int BufferWithExtendableBuffer::NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE = 90; +// TODO: Needs to allocate larger memory corresponding to the current vector size. +const size_t BufferWithExtendableBuffer::EXTEND_ADDITIONAL_BUFFER_SIZE_STEP = 128 * 1024; + +uint32_t BufferWithExtendableBuffer::readUint(const int size, const int pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(pos); + const int posInBuffer = readingPosIsInAdditionalBuffer ? pos - mOriginalBuffer.size() : pos; + return ByteArrayUtils::readUint(getBuffer(readingPosIsInAdditionalBuffer), size, posInBuffer); +} + +uint32_t BufferWithExtendableBuffer::readUintAndAdvancePosition(const int size, + int *const pos) const { + const uint32_t value = readUint(size, *pos); + *pos += size; + return value; +} + +void BufferWithExtendableBuffer::readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const { + const bool readingPosIsInAdditionalBuffer = isInAdditionalBuffer(*pos); + if (readingPosIsInAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + // Code point table is not used for dynamic format. + *outCodePointCount = ByteArrayUtils::readStringAndAdvancePosition( + getBuffer(readingPosIsInAdditionalBuffer), maxCodePointCount, + nullptr /* codePointTable */, outCodePoints, pos); + if (readingPosIsInAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } +} + +bool BufferWithExtendableBuffer::extend(const int size) { + return checkAndPrepareWriting(getTailPosition(), size); +} + +bool BufferWithExtendableBuffer::writeUint(const uint32_t data, const int size, const int pos) { + int writingPos = pos; + return writeUintAndAdvancePosition(data, size, &writingPos); +} + +bool BufferWithExtendableBuffer::writeUintAndAdvancePosition(const uint32_t data, const int size, + int *const pos) { + if (!(size >= 1 && size <= 4)) { + AKLOGI("writeUintAndAdvancePosition() is called with invalid size: %d", size); + ASSERT(false); + return false; + } + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = + usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data(); + if (usesAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + ByteArrayUtils::writeUintAndAdvancePosition(buffer, data, size, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } + return true; +} + +bool BufferWithExtendableBuffer::writeCodePointsAndAdvancePosition(const int *const codePoints, + const int codePointCount, const bool writesTerminator, int *const pos) { + const size_t size = ByteArrayUtils::calculateRequiredByteCountToStoreCodePoints( + codePoints, codePointCount, writesTerminator); + if (!checkAndPrepareWriting(*pos, size)) { + return false; + } + const bool usesAdditionalBuffer = isInAdditionalBuffer(*pos); + uint8_t *const buffer = + usesAdditionalBuffer ? mAdditionalBuffer.data() : mOriginalBuffer.data(); + if (usesAdditionalBuffer) { + *pos -= mOriginalBuffer.size(); + } + ByteArrayUtils::writeCodePointsAndAdvancePosition(buffer, codePoints, codePointCount, + writesTerminator, pos); + if (usesAdditionalBuffer) { + *pos += mOriginalBuffer.size(); + } + return true; +} + +bool BufferWithExtendableBuffer::extendBuffer(const size_t size) { + const size_t extendSize = std::max(EXTEND_ADDITIONAL_BUFFER_SIZE_STEP, size); + const size_t sizeAfterExtending = + std::min(mAdditionalBuffer.size() + extendSize, mMaxAdditionalBufferSize); + if (sizeAfterExtending < mAdditionalBuffer.size() + size) { + return false; + } + mAdditionalBuffer.resize(sizeAfterExtending); + return true; +} + +bool BufferWithExtendableBuffer::checkAndPrepareWriting(const int pos, const int size) { + if (pos < 0 || size < 0) { + // Invalid position or size. + return false; + } + const size_t totalRequiredSize = static_cast<size_t>(pos + size); + if (!isInAdditionalBuffer(pos)) { + // Here don't need to care about the additional buffer. + if (mOriginalBuffer.size() < totalRequiredSize) { + // Violate the boundary. + return false; + } + // The buffer has sufficient capacity. + return true; + } + // Hereafter, pos is in the additional buffer. + const size_t tailPosition = static_cast<size_t>(getTailPosition()); + if (totalRequiredSize <= tailPosition) { + // The buffer has sufficient capacity. + return true; + } + if (static_cast<size_t>(pos) != tailPosition) { + // The additional buffer must be extended from the tail position. + return false; + } + const size_t extendSize = totalRequiredSize - + std::min(mAdditionalBuffer.size() + mOriginalBuffer.size(), totalRequiredSize); + if (extendSize > 0 && !extendBuffer(extendSize)) { + // Failed to extend the buffer. + return false; + } + mUsedAdditionalBufferSize += size; + return true; +} + +bool BufferWithExtendableBuffer::copy(const BufferWithExtendableBuffer *const sourceBuffer) { + int copyingPos = 0; + const int tailPos = sourceBuffer->getTailPosition(); + const int maxDataChunkSize = sizeof(uint32_t); + while (copyingPos < tailPos) { + const int remainingSize = tailPos - copyingPos; + const int copyingSize = (remainingSize >= maxDataChunkSize) ? + maxDataChunkSize : remainingSize; + const uint32_t data = sourceBuffer->readUint(copyingSize, copyingPos); + if (!writeUint(data, copyingSize, copyingPos)) { + return false; + } + copyingPos += copyingSize; + } + return true; +} + +} diff --git a/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h new file mode 100644 index 000000000..0a141d4db --- /dev/null +++ b/native/jni/src/dictionary/utils/buffer_with_extendable_buffer.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H +#define LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H + +#include <cstddef> +#include <cstdint> +#include <vector> + +#include "defines.h" +#include "dictionary/utils/byte_array_utils.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +// This is used as a buffer that can be extended for updatable dictionaries. +// To optimize performance, raw pointer is directly used for reading buffer. The position has to be +// adjusted to access additional buffer. On the other hand, this class does not provide writable +// raw pointer but provides several methods that handle boundary checking for writing data. +class BufferWithExtendableBuffer { + public: + static const size_t DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE; + + BufferWithExtendableBuffer(const ReadWriteByteArrayView originalBuffer, + const int maxAdditionalBufferSize) + : mOriginalBuffer(originalBuffer), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + // Without original buffer. + BufferWithExtendableBuffer(const int maxAdditionalBufferSize) + : mOriginalBuffer(), mAdditionalBuffer(), mUsedAdditionalBufferSize(0), + mMaxAdditionalBufferSize(maxAdditionalBufferSize) {} + + AK_FORCE_INLINE int getTailPosition() const { + return mOriginalBuffer.size() + mUsedAdditionalBufferSize; + } + + AK_FORCE_INLINE int getUsedAdditionalBufferSize() const { + return mUsedAdditionalBufferSize; + } + + /** + * For reading. + */ + AK_FORCE_INLINE bool isInAdditionalBuffer(const int position) const { + return position >= static_cast<int>(mOriginalBuffer.size()); + } + + // TODO: Resolve the issue that the address can be changed when the vector is resized. + // CAVEAT!: Be careful about array out of bound access with buffers + AK_FORCE_INLINE const uint8_t *getBuffer(const bool usesAdditionalBuffer) const { + if (usesAdditionalBuffer) { + return mAdditionalBuffer.data(); + } else { + return mOriginalBuffer.data(); + } + } + + uint32_t readUint(const int size, const int pos) const; + + uint32_t readUintAndAdvancePosition(const int size, int *const pos) const; + + void readCodePointsAndAdvancePosition(const int maxCodePointCount, + int *const outCodePoints, int *outCodePointCount, int *const pos) const; + + AK_FORCE_INLINE int getOriginalBufferSize() const { + return mOriginalBuffer.size(); + } + + AK_FORCE_INLINE bool isNearSizeLimit() const { + return mAdditionalBuffer.size() >= ((mMaxAdditionalBufferSize + * NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE) / 100); + } + + bool extend(const int size); + + /** + * For writing. + * + * Writing is allowed for original buffer, already written region of additional buffer and the + * tail of additional buffer. + */ + bool writeUint(const uint32_t data, const int size, const int pos); + + bool writeUintAndAdvancePosition(const uint32_t data, const int size, int *const pos); + + bool writeCodePointsAndAdvancePosition(const int *const codePoints, const int codePointCount, + const bool writesTerminator, int *const pos); + + bool copy(const BufferWithExtendableBuffer *const sourceBuffer); + + private: + DISALLOW_COPY_AND_ASSIGN(BufferWithExtendableBuffer); + + static const int NEAR_BUFFER_LIMIT_THRESHOLD_PERCENTILE; + static const size_t EXTEND_ADDITIONAL_BUFFER_SIZE_STEP; + + const ReadWriteByteArrayView mOriginalBuffer; + std::vector<uint8_t> mAdditionalBuffer; + int mUsedAdditionalBufferSize; + const size_t mMaxAdditionalBufferSize; + + // Return if the buffer is successfully extended or not. + bool extendBuffer(const size_t size); + + // Returns if it is possible to write size-bytes from pos. When pos is at the tail position of + // the additional buffer, try extending the buffer. + bool checkAndPrepareWriting(const int pos, const int size); +}; +} +#endif /* LATINIME_BUFFER_WITH_EXTENDABLE_BUFFER_H */ diff --git a/native/jni/src/dictionary/utils/byte_array_utils.cpp b/native/jni/src/dictionary/utils/byte_array_utils.cpp new file mode 100644 index 000000000..d38f08217 --- /dev/null +++ b/native/jni/src/dictionary/utils/byte_array_utils.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint8_t ByteArrayUtils::MINIMUM_ONE_BYTE_CHARACTER_VALUE = 0x20; +const uint8_t ByteArrayUtils::MAXIMUM_ONE_BYTE_CHARACTER_VALUE = 0xFF; +const uint8_t ByteArrayUtils::CHARACTER_ARRAY_TERMINATOR = 0x1F; + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/byte_array_utils.h b/native/jni/src/dictionary/utils/byte_array_utils.h new file mode 100644 index 000000000..abb979050 --- /dev/null +++ b/native/jni/src/dictionary/utils/byte_array_utils.h @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BYTE_ARRAY_UTILS_H +#define LATINIME_BYTE_ARRAY_UTILS_H + +#include <cstdint> + +#include "defines.h" + +namespace latinime { + +/** + * Utility methods for reading byte arrays. + */ +class ByteArrayUtils { + public: + /** + * Integer writing + * + * Each method write a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, + const uint32_t data, const int size, int *const pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); + return; + case 2: + ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); + return; + case 3: + ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); + return; + case 4: + ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); + return; + default: + break; + } + } + + /** + * Integer reading + * + * Each method read a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) + ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; + } + + static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; + } + + static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 8) ^ buffer[pos + 1]; + } + + static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { + return buffer[pos]; + } + + static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint32(buffer, *pos); + *pos += 4; + return value; + } + + static AK_FORCE_INLINE int readSint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t value = readUint8(buffer, *pos); + if (value < 0x80) { + return readUint24AndAdvancePosition(buffer, pos); + } else { + (*pos)++; + return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); + } + } + + static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint24(buffer, *pos); + *pos += 3; + return value; + } + + static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint16_t value = readUint16(buffer, *pos); + *pos += 2; + return value; + } + + static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return buffer[(*pos)++]; + } + + static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, + const int size, const int pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + return ByteArrayUtils::readUint8(buffer, pos); + case 2: + return ByteArrayUtils::readUint16(buffer, pos); + case 3: + return ByteArrayUtils::readUint24(buffer, pos); + case 4: + return ByteArrayUtils::readUint32(buffer, pos); + default: + return 0; + } + } + + /** + * Code Point Reading + * + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + */ + static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { + int p = pos; + return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); + } + + static AK_FORCE_INLINE int readCodePointAndAdvancePosition( + const uint8_t *const buffer, const int *const codePointTable, int *const pos) { + /* + * codePointTable is an array to convert the most frequent characters in this dictionary to + * 1 byte code points. It is only made of the original code points of the most frequent + * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. + * The original code points are restored by picking the code points at the indices of the + * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. + */ + const uint8_t firstByte = readUint8(buffer, *pos); + if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { + if (firstByte == CHARACTER_ARRAY_TERMINATOR) { + *pos += 1; + return NOT_A_CODE_POINT; + } else { + return readUint24AndAdvancePosition(buffer, pos); + } + } else { + *pos += 1; + if (codePointTable) { + return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; + } + return firstByte; + } + } + + /** + * String (array of code points) Reading + * + * Reads code points until the terminator is found. + */ + // Returns the length of the string. + static int readStringAndAdvancePosition(const uint8_t *const buffer, + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + outBuffer[length++] = codePoint; + codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); + } + return length; + } + + // Advances the position and returns the length of the string. + static int advancePositionToBehindString( + const uint8_t *const buffer, const int maxLength, int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); + length++; + } + return length; + } + + /** + * String (array of code points) Writing + */ + static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, + const int *const codePoints, const int codePointCount, const bool writesTerminator, + int *const pos) { + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + writeUint24AndAdvancePosition(buffer, codePoint, pos); + } else { + // one byte character. + writeUint8AndAdvancePosition(buffer, codePoint, pos); + } + } + if (writesTerminator) { + writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); + } + } + + static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, + const int codePointCount, const bool writesTerminator) { + int byteCount = 0; + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + byteCount += 3; + } else { + // one byte character. + byteCount += 1; + } + } + if (writesTerminator) { + // The terminator is one byte. + byteCount += 1; + } + return byteCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); + + static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t CHARACTER_ARRAY_TERMINATOR; + + static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 24) & 0xFF; + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, + const uint16_t data, int *const pos) { + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, + const uint8_t data, int *const pos) { + buffer[(*pos)++] = data & 0xFF; + } +}; +} // namespace latinime +#endif /* LATINIME_BYTE_ARRAY_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp new file mode 100644 index 000000000..033a758ba --- /dev/null +++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/dict_file_writing_utils.h" + +#include <cstdio> +#include <errno.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "dictionary/header/header_policy.h" +#include "dictionary/structure/backward/v402/ver4_dict_buffers.h" +#include "dictionary/structure/pt_common/dynamic_pt_writing_utils.h" +#include "dictionary/structure/v4/ver4_dict_buffers.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "dictionary/utils/entry_counters.h" +#include "dictionary/utils/file_utils.h" +#include "dictionary/utils/format_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = ".tmp"; +// Enough size to describe buffer size. +const int DictFileWritingUtils::SIZE_OF_BUFFER_SIZE_FIELD = 4; + +/* static */ bool DictFileWritingUtils::createEmptyDictFile(const char *const filePath, + const int dictVersion, const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) { + TimeKeeper::setCurrentTime(); + const FormatUtils::FORMAT_VERSION formatVersion = FormatUtils::getFormatVersion(dictVersion); + switch (formatVersion) { + case FormatUtils::VERSION_402: + return createEmptyV4DictFile<backward::v402::Ver4DictConstants, + backward::v402::Ver4DictBuffers, + backward::v402::Ver4DictBuffers::Ver4DictBuffersPtr>( + filePath, localeAsCodePointVector, attributeMap, formatVersion); + case FormatUtils::VERSION_4_ONLY_FOR_TESTING: + case FormatUtils::VERSION_403: + return createEmptyV4DictFile<Ver4DictConstants, Ver4DictBuffers, + Ver4DictBuffers::Ver4DictBuffersPtr>( + filePath, localeAsCodePointVector, attributeMap, formatVersion); + default: + AKLOGE("Cannot create dictionary %s because format version %d is not supported.", + filePath, dictVersion); + return false; + } +} + +template<class DictConstants, class DictBuffers, class DictBuffersPtr> +/* static */ bool DictFileWritingUtils::createEmptyV4DictFile(const char *const dirPath, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion) { + HeaderPolicy headerPolicy(formatVersion, localeAsCodePointVector, attributeMap); + DictBuffersPtr dictBuffers = DictBuffers::createVer4DictBuffers(&headerPolicy, + DictConstants::MAX_DICT_EXTENDED_REGION_SIZE); + headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + EntryCounts(), 0 /* extendedRegionSize */, dictBuffers->getWritableHeaderBuffer()); + if (!DynamicPtWritingUtils::writeEmptyDictionary( + dictBuffers->getWritableTrieBuffer(), 0 /* rootPos */)) { + AKLOGE("Empty ver4 dictionary structure cannot be created on memory."); + return false; + } + return dictBuffers->flush(dirPath); +} + +/* static */ bool DictFileWritingUtils::flushBufferToFileWithSuffix(const char *const basePath, + const char *const suffix, const BufferWithExtendableBuffer *const buffer) { + const int filePathBufSize = FileUtils::getFilePathWithSuffixBufSize(basePath, suffix); + char filePath[filePathBufSize]; + FileUtils::getFilePathWithSuffix(basePath, suffix, filePathBufSize, filePath); + return flushBufferToFile(filePath, buffer); +} + +/* static */ bool DictFileWritingUtils::writeBufferToFileTail(FILE *const file, + const BufferWithExtendableBuffer *const buffer) { + uint8_t bufferSize[SIZE_OF_BUFFER_SIZE_FIELD]; + int writingPos = 0; + ByteArrayUtils::writeUintAndAdvancePosition(bufferSize, buffer->getTailPosition(), + SIZE_OF_BUFFER_SIZE_FIELD, &writingPos); + if (fwrite(bufferSize, SIZE_OF_BUFFER_SIZE_FIELD, 1 /* count */, file) < 1) { + return false; + } + return writeBufferToFile(file, buffer); +} + +/* static */ bool DictFileWritingUtils::flushBufferToFile(const char *const filePath, + const BufferWithExtendableBuffer *const buffer) { + const int fd = open(filePath, O_WRONLY | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR); + if (fd == -1) { + AKLOGE("File %s cannot be opened. errno: %d", filePath, errno); + ASSERT(false); + return false; + } + FILE *const file = fdopen(fd, "wb"); + if (!file) { + AKLOGE("fdopen failed for the file %s. errno: %d", filePath, errno); + ASSERT(false); + return false; + } + if (!writeBufferToFile(file, buffer)) { + fclose(file); + remove(filePath); + AKLOGE("Buffer cannot be written to the file %s. size: %d", filePath, + buffer->getTailPosition()); + ASSERT(false); + return false; + } + fclose(file); + return true; +} + +// Returns whether the writing was succeeded or not. +/* static */ bool DictFileWritingUtils::writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer) { + const int originalBufSize = buffer->getOriginalBufferSize(); + if (originalBufSize > 0 && fwrite(buffer->getBuffer(false /* usesAdditionalBuffer */), + originalBufSize, 1, file) < 1) { + return false; + } + const int additionalBufSize = buffer->getUsedAdditionalBufferSize(); + if (additionalBufSize > 0 && fwrite(buffer->getBuffer(true /* usesAdditionalBuffer */), + additionalBufSize, 1, file) < 1) { + return false; + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/dict_file_writing_utils.h b/native/jni/src/dictionary/utils/dict_file_writing_utils.h new file mode 100644 index 000000000..102a89da4 --- /dev/null +++ b/native/jni/src/dictionary/utils/dict_file_writing_utils.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_FILE_WRITING_UTILS_H +#define LATINIME_DICT_FILE_WRITING_UTILS_H + +#include <cstdio> + +#include "defines.h" +#include "dictionary/header/header_read_write_utils.h" +#include "dictionary/utils/format_utils.h" + +namespace latinime { + +class BufferWithExtendableBuffer; + +class DictFileWritingUtils { + public: + static const char *const TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE; + + static bool createEmptyDictFile(const char *const filePath, const int dictVersion, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap); + + static bool flushBufferToFileWithSuffix(const char *const basePath, const char *const suffix, + const BufferWithExtendableBuffer *const buffer); + + static bool writeBufferToFileTail(FILE *const file, + const BufferWithExtendableBuffer *const buffer); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(DictFileWritingUtils); + + static const int SIZE_OF_BUFFER_SIZE_FIELD; + + static bool createEmptyV401DictFile(const char *const filePath, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion); + + template<class DictConstants, class DictBuffers, class DictBuffersPtr> + static bool createEmptyV4DictFile(const char *const filePath, + const std::vector<int> localeAsCodePointVector, + const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap, + const FormatUtils::FORMAT_VERSION formatVersion); + + static bool flushBufferToFile(const char *const filePath, + const BufferWithExtendableBuffer *const buffer); + + static bool writeBufferToFile(FILE *const file, + const BufferWithExtendableBuffer *const buffer); +}; +} // namespace latinime +#endif /* LATINIME_DICT_FILE_WRITING_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/entry_counters.h b/native/jni/src/dictionary/utils/entry_counters.h new file mode 100644 index 000000000..5e443026e --- /dev/null +++ b/native/jni/src/dictionary/utils/entry_counters.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_ENTRY_COUNTERS_H +#define LATINIME_ENTRY_COUNTERS_H + +#include <array> + +#include "defines.h" +#include "utils/ngram_utils.h" + +namespace latinime { + +// Copyable but immutable +class EntryCounts final { + public: + EntryCounts() : mEntryCounts({{0, 0, 0, 0}}) {} + + explicit EntryCounts(const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters) + : mEntryCounts(counters) {} + + int getNgramCount(const NgramType ngramType) const { + return mEntryCounts[static_cast<int>(ngramType)]; + } + + const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &getCountArray() const { + return mEntryCounts; + } + + private: + DISALLOW_ASSIGNMENT_OPERATOR(EntryCounts); + + // Counts from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram + // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element) + const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounts; +}; + +class MutableEntryCounters final { + public: + MutableEntryCounters() { + mEntryCounters.fill(0); + } + + explicit MutableEntryCounters( + const std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> &counters) + : mEntryCounters(counters) {} + + const EntryCounts getEntryCounts() const { + return EntryCounts(mEntryCounters); + } + + void incrementNgramCount(const NgramType ngramType) { + ++mEntryCounters[static_cast<int>(ngramType)]; + } + + void decrementNgramCount(const NgramType ngramType) { + --mEntryCounters[static_cast<int>(ngramType)]; + } + + int getNgramCount(const NgramType ngramType) const { + return mEntryCounters[static_cast<int>(ngramType)]; + } + + void setNgramCount(const NgramType ngramType, const int count) { + mEntryCounters[static_cast<int>(ngramType)] = count; + } + + private: + DISALLOW_COPY_AND_ASSIGN(MutableEntryCounters); + + // Counters from Unigram (0-th element) to (MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1)-gram + // (MAX_PREV_WORD_COUNT_FOR_N_GRAM-th element) + std::array<int, MAX_PREV_WORD_COUNT_FOR_N_GRAM + 1> mEntryCounters; +}; +} // namespace latinime +#endif /* LATINIME_ENTRY_COUNTERS_H */ diff --git a/native/jni/src/dictionary/utils/file_utils.cpp b/native/jni/src/dictionary/utils/file_utils.cpp new file mode 100644 index 000000000..bb392fb32 --- /dev/null +++ b/native/jni/src/dictionary/utils/file_utils.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/file_utils.h" + +#include <cstdio> +#include <cstring> +#include <dirent.h> +#include <fcntl.h> +#include <libgen.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> + +namespace latinime { + +// Returns -1 on error. +/* static */ int FileUtils::getFileSize(const char *const filePath) { + const int fd = open(filePath, O_RDONLY); + if (fd == -1) { + return -1; + } + struct stat statBuf; + if (fstat(fd, &statBuf) != 0) { + close(fd); + return -1; + } + close(fd); + return static_cast<int>(statBuf.st_size); +} + +/* static */ bool FileUtils::existsDir(const char *const dirPath) { + DIR *const dir = opendir(dirPath); + if (dir == NULL) { + return false; + } + closedir(dir); + return true; +} + +// Remove a directory and all files in the directory. +/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath) { + return removeDirAndFiles(dirPath, 5 /* maxTries */); +} + +// Remove a directory and all files in the directory, trying up to maxTimes. +/* static */ bool FileUtils::removeDirAndFiles(const char *const dirPath, const int maxTries) { + DIR *const dir = opendir(dirPath); + if (dir == NULL) { + AKLOGE("Cannot open dir %s.", dirPath); + return true; + } + struct dirent *dirent; + while ((dirent = readdir(dir)) != NULL) { + if (dirent->d_type == DT_DIR) { + continue; + } + if (strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0) { + continue; + } + const int filePathBufSize = getFilePathBufSize(dirPath, dirent->d_name); + char filePath[filePathBufSize]; + getFilePath(dirPath, dirent->d_name, filePathBufSize, filePath); + if (remove(filePath) != 0) { + AKLOGE("Cannot remove file %s.", filePath); + closedir(dir); + return false; + } + } + closedir(dir); + if (remove(dirPath) != 0) { + if (maxTries > 0) { + // On NFS, deleting files sometimes creates new files. I'm not sure what the + // correct way of dealing with this is, but for the time being, this seems to work. + removeDirAndFiles(dirPath, maxTries - 1); + } else { + AKLOGE("Cannot remove directory %s.", dirPath); + return false; + } + } + return true; +} + +/* static */ int FileUtils::getFilePathWithSuffixBufSize(const char *const filePath, + const char *const suffix) { + return strlen(filePath) + strlen(suffix) + 1 /* terminator */; +} + +/* static */ void FileUtils::getFilePathWithSuffix(const char *const filePath, + const char *const suffix, const int filePathBufSize, char *const outFilePath) { + snprintf(outFilePath, filePathBufSize, "%s%s", filePath, suffix); +} + +/* static */ int FileUtils::getFilePathBufSize(const char *const dirPath, + const char *const fileName) { + return strlen(dirPath) + 1 /* '/' */ + strlen(fileName) + 1 /* terminator */; +} + +/* static */ void FileUtils::getFilePath(const char *const dirPath, const char *const fileName, + const int filePathBufSize, char *const outFilePath) { + snprintf(outFilePath, filePathBufSize, "%s/%s", dirPath, fileName); +} + +/* static */ bool FileUtils::getFilePathWithoutSuffix(const char *const filePath, + const char *const suffix, const int outDirPathBufSize, char *const outDirPath) { + const int filePathLength = strlen(filePath); + const int suffixLength = strlen(suffix); + if (filePathLength <= suffixLength) { + AKLOGE("File path length (%s:%d) is shorter that suffix length (%s:%d).", + filePath, filePathLength, suffix, suffixLength); + return false; + } + const int resultFilePathLength = filePathLength - suffixLength; + if (outDirPathBufSize <= resultFilePathLength) { + AKLOGE("outDirPathBufSize is too small. filePath: %s, suffix: %s, outDirPathBufSize: %d", + filePath, suffix, outDirPathBufSize); + return false; + } + if (strncmp(filePath + resultFilePathLength, suffix, suffixLength) != 0) { + AKLOGE("File Path %s does not have %s as a suffix", filePath, suffix); + return false; + } + snprintf(outDirPath, resultFilePathLength + 1 /* terminator */, "%s", filePath); + return true; +} + +/* static */ void FileUtils::getDirPath(const char *const filePath, const int outDirPathBufSize, + char *const outDirPath) { + for (int i = strlen(filePath) - 1; i >= 0; --i) { + if (filePath[i] == '/') { + if (i >= outDirPathBufSize) { + AKLOGE("outDirPathBufSize is too small. filePath: %s, outDirPathBufSize: %d", + filePath, outDirPathBufSize); + ASSERT(false); + return; + } + snprintf(outDirPath, i + 1 /* terminator */, "%s", filePath); + return; + } + } +} + +/* static */ void FileUtils::getBasename(const char *const filePath, + const int outNameBufSize, char *const outName) { + const int filePathBufSize = strlen(filePath) + 1 /* terminator */; + char filePathBuf[filePathBufSize]; + snprintf(filePathBuf, filePathBufSize, "%s", filePath); + const char *const baseName = basename(filePathBuf); + const int baseNameLength = strlen(baseName); + if (baseNameLength >= outNameBufSize) { + AKLOGE("outNameBufSize is too small. filePath: %s, outNameBufSize: %d", + filePath, outNameBufSize); + return; + } + snprintf(outName, baseNameLength + 1 /* terminator */, "%s", baseName); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/file_utils.h b/native/jni/src/dictionary/utils/file_utils.h new file mode 100644 index 000000000..4f1b93a6a --- /dev/null +++ b/native/jni/src/dictionary/utils/file_utils.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FILE_UTILS_H +#define LATINIME_FILE_UTILS_H + +#include "defines.h" + +namespace latinime { + +class FileUtils { + public: + // Returns -1 on error. + static int getFileSize(const char *const filePath); + + static bool existsDir(const char *const dirPath); + + // Remove a directory and all files in the directory. + static bool removeDirAndFiles(const char *const dirPath); + + static int getFilePathWithSuffixBufSize(const char *const filePath, const char *const suffix); + + static void getFilePathWithSuffix(const char *const filePath, const char *const suffix, + const int filePathBufSize, char *const outFilePath); + + static int getFilePathBufSize(const char *const dirPath, const char *const fileName); + + static void getFilePath(const char *const dirPath, const char *const fileName, + const int filePathBufSize, char *const outFilePath); + + // Returns whether the filePath have the suffix. + static bool getFilePathWithoutSuffix(const char *const filePath, const char *const suffix, + const int dirPathBufSize, char *const outDirPath); + + static void getDirPath(const char *const filePath, const int dirPathBufSize, + char *const outDirPath); + + static void getBasename(const char *const filePath, const int outNameBufSize, + char *const outName); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(FileUtils); + + static bool removeDirAndFiles(const char *const dirPath, const int maxTries); +}; +} // namespace latinime +#endif /* LATINIME_FILE_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp new file mode 100644 index 000000000..d79ed911b --- /dev/null +++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.cpp @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/forgetting_curve_utils.h" + +#include <algorithm> +#include <cmath> +#include <stdlib.h> + +#include "dictionary/header/header_policy.h" +#include "dictionary/utils/probability_utils.h" +#include "utils/time_keeper.h" + +namespace latinime { + +const int ForgettingCurveUtils::MULTIPLIER_TWO_IN_PROBABILITY_SCALE = 8; +const int ForgettingCurveUtils::DECAY_INTERVAL_SECONDS = 2 * 60 * 60; + +const int ForgettingCurveUtils::MAX_LEVEL = 15; +const int ForgettingCurveUtils::MIN_VISIBLE_LEVEL = 2; +const int ForgettingCurveUtils::MAX_ELAPSED_TIME_STEP_COUNT = 31; +const int ForgettingCurveUtils::DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD = 30; +const int ForgettingCurveUtils::OCCURRENCES_TO_RAISE_THE_LEVEL = 1; +// TODO: Evaluate whether this should be 7.5 days. +// 15 days +const int ForgettingCurveUtils::DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS = 15 * 24 * 60 * 60; + +const float ForgettingCurveUtils::ENTRY_COUNT_HARD_LIMIT_WEIGHT = 1.2; + +const ForgettingCurveUtils::ProbabilityTable ForgettingCurveUtils::sProbabilityTable; + +// TODO: Revise the logic to decide the initial probability depending on the given probability. +/* static */ const HistoricalInfo ForgettingCurveUtils::createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy) { + const int timestamp = newHistoricalInfo->getTimestamp(); + if (newProbability != NOT_A_PROBABILITY && originalHistoricalInfo->getLevel() == 0) { + // Add entry as a valid word. + const int level = clampToVisibleEntryLevelRange(newHistoricalInfo->getLevel()); + const int count = clampToValidCountRange(newHistoricalInfo->getCount(), headerPolicy); + return HistoricalInfo(timestamp, level, count); + } else if (!originalHistoricalInfo->isValid() + || originalHistoricalInfo->getLevel() < newHistoricalInfo->getLevel() + || (originalHistoricalInfo->getLevel() == newHistoricalInfo->getLevel() + && originalHistoricalInfo->getCount() < newHistoricalInfo->getCount())) { + // Initial information. + int count = newHistoricalInfo->getCount(); + if (count >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel() + 1); + return HistoricalInfo(timestamp, level, 0 /* count */); + } + const int level = clampToValidLevelRange(newHistoricalInfo->getLevel()); + return HistoricalInfo(timestamp, level, clampToValidCountRange(count, headerPolicy)); + } else { + const int updatedCount = originalHistoricalInfo->getCount() + 1; + if (updatedCount >= OCCURRENCES_TO_RAISE_THE_LEVEL) { + // The count exceeds the max value the level can be incremented. + if (originalHistoricalInfo->getLevel() >= MAX_LEVEL) { + // The level is already max. + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel(), originalHistoricalInfo->getCount()); + } else { + // Raise the level. + return HistoricalInfo(timestamp, + originalHistoricalInfo->getLevel() + 1, 0 /* count */); + } + } else { + return HistoricalInfo(timestamp, originalHistoricalInfo->getLevel(), updatedCount); + } + } +} + +/* static */ int ForgettingCurveUtils::decodeProbability( + const HistoricalInfo *const historicalInfo, const HeaderPolicy *const headerPolicy) { + const int elapsedTimeStepCount = getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS); + return sProbabilityTable.getProbability( + headerPolicy->getForgettingCurveProbabilityValuesTableId(), + clampToValidLevelRange(historicalInfo->getLevel()), + clampToValidTimeStepCountRange(elapsedTimeStepCount)); +} + +/* static */ bool ForgettingCurveUtils::needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy) { + return historicalInfo->getLevel() > 0 + || getElapsedTimeStepCount(historicalInfo->getTimestamp(), + DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS) + < DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; +} + +/* static */ const HistoricalInfo ForgettingCurveUtils::createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy) { + if (originalHistoricalInfo->getTimestamp() == NOT_A_TIMESTAMP) { + return HistoricalInfo(); + } + const int durationToLevelDownInSeconds = DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; + const int elapsedTimeStep = getElapsedTimeStepCount( + originalHistoricalInfo->getTimestamp(), durationToLevelDownInSeconds); + if (elapsedTimeStep <= MAX_ELAPSED_TIME_STEP_COUNT) { + // No need to update historical info. + return *originalHistoricalInfo; + } + // Lower the level. + const int maxLevelDownAmonut = elapsedTimeStep / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + const int levelDownAmount = (maxLevelDownAmonut >= originalHistoricalInfo->getLevel()) ? + originalHistoricalInfo->getLevel() : maxLevelDownAmonut; + const int adjustedTimestampInSeconds = originalHistoricalInfo->getTimestamp() + + levelDownAmount * durationToLevelDownInSeconds; + return HistoricalInfo(adjustedTimestampInSeconds, + originalHistoricalInfo->getLevel() - levelDownAmount, 0 /* count */); +} + +/* static */ bool ForgettingCurveUtils::needsToDecay(const bool mindsBlockByDecay, + const EntryCounts &entryCounts, const HeaderPolicy *const headerPolicy) { + const EntryCounts &maxNgramCounts = headerPolicy->getMaxNgramCounts(); + for (const auto ngramType : AllNgramTypes::ASCENDING) { + if (entryCounts.getNgramCount(ngramType) + >= getEntryCountHardLimit(maxNgramCounts.getNgramCount(ngramType))) { + // Unigram count exceeds the limit. + return true; + } + } + if (mindsBlockByDecay) { + return false; + } + if (headerPolicy->getLastDecayedTime() + DECAY_INTERVAL_SECONDS + < TimeKeeper::peekCurrentTime()) { + // Time to decay. + return true; + } + return false; +} + +// See comments in ProbabilityUtils::backoff(). +/* static */ int ForgettingCurveUtils::backoff(const int unigramProbability) { + // See TODO comments in ForgettingCurveUtils::getProbability(). + return unigramProbability; +} + +/* static */ int ForgettingCurveUtils::getElapsedTimeStepCount(const int timestamp, + const int durationToLevelDownInSeconds) { + const int elapsedTimeInSeconds = TimeKeeper::peekCurrentTime() - timestamp; + const int timeStepDurationInSeconds = + durationToLevelDownInSeconds / (MAX_ELAPSED_TIME_STEP_COUNT + 1); + return elapsedTimeInSeconds / timeStepDurationInSeconds; +} + +/* static */ int ForgettingCurveUtils::clampToVisibleEntryLevelRange(const int level) { + return std::min(std::max(level, MIN_VISIBLE_LEVEL), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidCountRange(const int count, + const HeaderPolicy *const headerPolicy) { + return std::min(std::max(count, 0), OCCURRENCES_TO_RAISE_THE_LEVEL - 1); +} + +/* static */ int ForgettingCurveUtils::clampToValidLevelRange(const int level) { + return std::min(std::max(level, 0), MAX_LEVEL); +} + +/* static */ int ForgettingCurveUtils::clampToValidTimeStepCountRange(const int timeStepCount) { + return std::min(std::max(timeStepCount, 0), MAX_ELAPSED_TIME_STEP_COUNT); +} + +const int ForgettingCurveUtils::ProbabilityTable::PROBABILITY_TABLE_COUNT = 4; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_PROBABILITY_TABLE_ID = 0; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_PROBABILITY_TABLE_ID = 1; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_PROBABILITY_TABLE_ID = 2; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_PROBABILITY_TABLE_ID = 3; +const int ForgettingCurveUtils::ProbabilityTable::WEAK_MAX_PROBABILITY = 127; +const int ForgettingCurveUtils::ProbabilityTable::MODEST_BASE_PROBABILITY = 8; +const int ForgettingCurveUtils::ProbabilityTable::STRONG_BASE_PROBABILITY = 9; +const int ForgettingCurveUtils::ProbabilityTable::AGGRESSIVE_BASE_PROBABILITY = 10; + + +ForgettingCurveUtils::ProbabilityTable::ProbabilityTable() : mTables() { + mTables.resize(PROBABILITY_TABLE_COUNT); + for (int tableId = 0; tableId < PROBABILITY_TABLE_COUNT; ++tableId) { + mTables[tableId].resize(MAX_LEVEL + 1); + for (int level = 0; level <= MAX_LEVEL; ++level) { + mTables[tableId][level].resize(MAX_ELAPSED_TIME_STEP_COUNT + 1); + const float initialProbability = getBaseProbabilityForLevel(tableId, level); + const float endProbability = getBaseProbabilityForLevel(tableId, level - 1); + for (int timeStepCount = 0; timeStepCount <= MAX_ELAPSED_TIME_STEP_COUNT; + ++timeStepCount) { + if (level < MIN_VISIBLE_LEVEL) { + mTables[tableId][level][timeStepCount] = NOT_A_PROBABILITY; + continue; + } + const float probability = initialProbability + * powf(initialProbability / endProbability, + -1.0f * static_cast<float>(timeStepCount) + / static_cast<float>(MAX_ELAPSED_TIME_STEP_COUNT + 1)); + mTables[tableId][level][timeStepCount] = + std::min(std::max(static_cast<int>(probability), 1), MAX_PROBABILITY); + } + } + } +} + +/* static */ int ForgettingCurveUtils::ProbabilityTable::getBaseProbabilityForLevel( + const int tableId, const int level) { + if (tableId == WEAK_PROBABILITY_TABLE_ID) { + // Max probability is 127. + return static_cast<float>(WEAK_MAX_PROBABILITY / (1 << (MAX_LEVEL - level))); + } else if (tableId == MODEST_PROBABILITY_TABLE_ID) { + // Max probability is 128. + return static_cast<float>(MODEST_BASE_PROBABILITY * (level + 1)); + } else if (tableId == STRONG_PROBABILITY_TABLE_ID) { + // Max probability is 140. + return static_cast<float>(STRONG_BASE_PROBABILITY * (level + 1)); + } else if (tableId == AGGRESSIVE_PROBABILITY_TABLE_ID) { + // Max probability is 160. + return static_cast<float>(AGGRESSIVE_BASE_PROBABILITY * (level + 1)); + } else { + return NOT_A_PROBABILITY; + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/forgetting_curve_utils.h b/native/jni/src/dictionary/utils/forgetting_curve_utils.h new file mode 100644 index 000000000..ddaac7e3b --- /dev/null +++ b/native/jni/src/dictionary/utils/forgetting_curve_utils.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FORGETTING_CURVE_UTILS_H +#define LATINIME_FORGETTING_CURVE_UTILS_H + +#include <vector> + +#include "defines.h" +#include "dictionary/property/historical_info.h" +#include "dictionary/utils/entry_counters.h" + +namespace latinime { + +class HeaderPolicy; + +class ForgettingCurveUtils { + public: + static const HistoricalInfo createUpdatedHistoricalInfo( + const HistoricalInfo *const originalHistoricalInfo, const int newProbability, + const HistoricalInfo *const newHistoricalInfo, const HeaderPolicy *const headerPolicy); + + static const HistoricalInfo createHistoricalInfoToSave( + const HistoricalInfo *const originalHistoricalInfo, + const HeaderPolicy *const headerPolicy); + + static int decodeProbability(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); + + static bool needsToKeep(const HistoricalInfo *const historicalInfo, + const HeaderPolicy *const headerPolicy); + + static bool needsToDecay(const bool mindsBlockByDecay, const EntryCounts &entryCounters, + const HeaderPolicy *const headerPolicy); + + // TODO: Improve probability computation method and remove this. + static int getProbabilityBiasForNgram(const int n) { + return (n - 1) * MULTIPLIER_TWO_IN_PROBABILITY_SCALE; + } + + AK_FORCE_INLINE static int getEntryCountHardLimit(const int maxEntryCount) { + return static_cast<int>(static_cast<float>(maxEntryCount) + * ENTRY_COUNT_HARD_LIMIT_WEIGHT); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ForgettingCurveUtils); + + class ProbabilityTable { + public: + ProbabilityTable(); + + int getProbability(const int tableId, const int level, + const int elapsedTimeStepCount) const { + return mTables[tableId][level][elapsedTimeStepCount]; + } + + private: + DISALLOW_COPY_AND_ASSIGN(ProbabilityTable); + + static const int PROBABILITY_TABLE_COUNT; + static const int WEAK_PROBABILITY_TABLE_ID; + static const int MODEST_PROBABILITY_TABLE_ID; + static const int STRONG_PROBABILITY_TABLE_ID; + static const int AGGRESSIVE_PROBABILITY_TABLE_ID; + + static const int WEAK_MAX_PROBABILITY; + static const int MODEST_BASE_PROBABILITY; + static const int STRONG_BASE_PROBABILITY; + static const int AGGRESSIVE_BASE_PROBABILITY; + + std::vector<std::vector<std::vector<int>>> mTables; + + static int getBaseProbabilityForLevel(const int tableId, const int level); + }; + + static const int MULTIPLIER_TWO_IN_PROBABILITY_SCALE; + static const int DECAY_INTERVAL_SECONDS; + + static const int MAX_LEVEL; + static const int MIN_VISIBLE_LEVEL; + static const int MAX_ELAPSED_TIME_STEP_COUNT; + static const int DISCARD_LEVEL_ZERO_ENTRY_TIME_STEP_COUNT_THRESHOLD; + static const int OCCURRENCES_TO_RAISE_THE_LEVEL; + static const int DURATION_TO_LOWER_THE_LEVEL_IN_SECONDS; + + static const float ENTRY_COUNT_HARD_LIMIT_WEIGHT; + + static const ProbabilityTable sProbabilityTable; + + static int backoff(const int unigramProbability); + static int getElapsedTimeStepCount(const int timestamp, const int durationToLevelDown); + static int clampToVisibleEntryLevelRange(const int level); + static int clampToValidLevelRange(const int level); + static int clampToValidCountRange(const int count, const HeaderPolicy *const headerPolicy); + static int clampToValidTimeStepCountRange(const int timeStepCount); +}; +} // namespace latinime +#endif /* LATINIME_FORGETTING_CURVE_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/format_utils.cpp b/native/jni/src/dictionary/utils/format_utils.cpp new file mode 100644 index 000000000..cef3b094c --- /dev/null +++ b/native/jni/src/dictionary/utils/format_utils.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/format_utils.h" + +#include "dictionary/utils/byte_array_utils.h" + +namespace latinime { + +const uint32_t FormatUtils::MAGIC_NUMBER = 0x9BC13AFE; + +// Magic number (4 bytes), version (2 bytes), flags (2 bytes), header size (4 bytes) = 12 +const size_t FormatUtils::DICTIONARY_MINIMUM_SIZE = 12; + +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::getFormatVersion(const int formatVersion) { + switch (formatVersion) { + case VERSION_2: + case VERSION_201: + AKLOGE("Dictionary versions 2 and 201 are incompatible with this version"); + return UNKNOWN_VERSION; + case VERSION_202: + return VERSION_202; + case VERSION_4_ONLY_FOR_TESTING: + return VERSION_4_ONLY_FOR_TESTING; + case VERSION_402: + return VERSION_402; + case VERSION_403: + return VERSION_403; + default: + return UNKNOWN_VERSION; + } +} +/* static */ FormatUtils::FORMAT_VERSION FormatUtils::detectFormatVersion( + const ReadOnlyByteArrayView dictBuffer) { + // The magic number is stored big-endian. + // If the dictionary is less than 4 bytes, we can't even read the magic number, so we don't + // understand this format. + if (dictBuffer.size() < DICTIONARY_MINIMUM_SIZE) { + return UNKNOWN_VERSION; + } + const uint32_t magicNumber = ByteArrayUtils::readUint32(dictBuffer.data(), 0); + switch (magicNumber) { + case MAGIC_NUMBER: + // The layout of the header is as follows: + // Magic number (4 bytes) 0x9B 0xC1 0x3A 0xFE + // Dictionary format version number (2 bytes) + // Options (2 bytes) + // Header size (4 bytes) : integer, big endian + // Conceptually this converts the hardcoded value of the bytes in the file into + // the symbolic value we use in the code. But we want the constants to be the + // same so we use them for both here. + return getFormatVersion(ByteArrayUtils::readUint16(dictBuffer.data(), 4)); + default: + return UNKNOWN_VERSION; + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/format_utils.h b/native/jni/src/dictionary/utils/format_utils.h new file mode 100644 index 000000000..1616efcce --- /dev/null +++ b/native/jni/src/dictionary/utils/format_utils.h @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_FORMAT_UTILS_H +#define LATINIME_FORMAT_UTILS_H + +#include <cstdint> + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/** + * Methods to handle binary dictionary format version. + */ +class FormatUtils { + public: + enum FORMAT_VERSION { + // These MUST have the same values as the relevant constants in FormatSpec.java. + // TODO: Remove VERSION_2 and VERSION_201 when we: + // * Confirm that old versions of LatinIME download old-format dictionaries + // * We no longer need the corresponding constants on the Java side for dicttool + VERSION_2 = 2, + VERSION_201 = 201, + VERSION_202 = 202, + VERSION_4_ONLY_FOR_TESTING = 399, + VERSION_402 = 402, + VERSION_403 = 403, + UNKNOWN_VERSION = -1 + }; + + // 32 bit magic number is stored at the beginning of the dictionary header to reject + // unsupported or obsolete dictionary formats. + static const uint32_t MAGIC_NUMBER; + + static FORMAT_VERSION getFormatVersion(const int formatVersion); + static FORMAT_VERSION detectFormatVersion(const ReadOnlyByteArrayView dictBuffer); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(FormatUtils); + + static const size_t DICTIONARY_MINIMUM_SIZE; +}; +} // namespace latinime +#endif /* LATINIME_FORMAT_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/mmapped_buffer.cpp b/native/jni/src/dictionary/utils/mmapped_buffer.cpp new file mode 100644 index 000000000..c5259de6d --- /dev/null +++ b/native/jni/src/dictionary/utils/mmapped_buffer.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/mmapped_buffer.h" + +#include <cerrno> +#include <climits> +#include <cstdio> +#include <fcntl.h> +#include <sys/mman.h> +#include <unistd.h> + +#include "dictionary/utils/file_utils.h" + +namespace latinime { + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const int bufferOffset, const int bufferSize, + const bool isUpdatable) { + const int mmapFd = open(path, O_RDONLY); + if (mmapFd < 0) { + AKLOGE("DICT: Can't open the source. path=%s errno=%d", path, errno); + return nullptr; + } + const int pagesize = sysconf(_SC_PAGESIZE); + const int offset = bufferOffset % pagesize; + int alignedOffset = bufferOffset - offset; + int alignedSize = bufferSize + offset; + const int protMode = isUpdatable ? PROT_READ | PROT_WRITE : PROT_READ; + void *const mmappedBuffer = mmap(0, alignedSize, protMode, MAP_PRIVATE, mmapFd, + alignedOffset); + if (mmappedBuffer == MAP_FAILED) { + AKLOGE("DICT: Can't mmap dictionary. errno=%d", errno); + close(mmapFd); + return nullptr; + } + uint8_t *const buffer = static_cast<uint8_t *>(mmappedBuffer) + offset; + if (!buffer) { + AKLOGE("DICT: buffer is null"); + close(mmapFd); + return nullptr; + } + return MmappedBufferPtr(new MmappedBuffer(buffer, bufferSize, mmappedBuffer, alignedSize, + mmapFd, isUpdatable)); +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const path, const bool isUpdatable) { + const int fileSize = FileUtils::getFileSize(path); + if (fileSize == -1) { + return nullptr; + } else if (fileSize == 0) { + return MmappedBufferPtr(new MmappedBuffer(isUpdatable)); + } else { + return openBuffer(path, 0 /* bufferOffset */, fileSize, isUpdatable); + } +} + +/* static */ MmappedBuffer::MmappedBufferPtr MmappedBuffer::openBuffer( + const char *const dirPath, const char *const fileName, const bool isUpdatable) { + const int filePathBufferSize = PATH_MAX + 1 /* terminator */; + char filePath[filePathBufferSize]; + const int filePathLength = snprintf(filePath, filePathBufferSize, "%s%s", dirPath, + fileName); + if (filePathLength >= filePathBufferSize) { + return nullptr; + } + return openBuffer(filePath, isUpdatable); +} + +MmappedBuffer::~MmappedBuffer() { + if (mAlignedSize == 0) { + return; + } + int ret = munmap(mMmappedBuffer, mAlignedSize); + if (ret != 0) { + AKLOGE("DICT: Failure in munmap. ret=%d errno=%d", ret, errno); + } + ret = close(mMmapFd); + if (ret != 0) { + AKLOGE("DICT: Failure in close. ret=%d errno=%d", ret, errno); + } +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/mmapped_buffer.h b/native/jni/src/dictionary/utils/mmapped_buffer.h new file mode 100644 index 000000000..e25310373 --- /dev/null +++ b/native/jni/src/dictionary/utils/mmapped_buffer.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_MMAPPED_BUFFER_H +#define LATINIME_MMAPPED_BUFFER_H + +#include <cstdint> +#include <memory> + +#include "defines.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +class MmappedBuffer { + public: + typedef std::unique_ptr<const MmappedBuffer> MmappedBufferPtr; + + static MmappedBufferPtr openBuffer(const char *const path, + const int bufferOffset, const int bufferSize, const bool isUpdatable); + + // Mmap entire file. + static MmappedBufferPtr openBuffer(const char *const path, const bool isUpdatable); + + static MmappedBufferPtr openBuffer(const char *const dirPath, const char *const fileName, + const bool isUpdatable); + + ~MmappedBuffer(); + + ReadWriteByteArrayView getReadWriteByteArrayView() const { + return mByteArrayView; + } + + ReadOnlyByteArrayView getReadOnlyByteArrayView() const { + return mByteArrayView.getReadOnlyView(); + } + + AK_FORCE_INLINE bool isUpdatable() const { + return mIsUpdatable; + } + + private: + AK_FORCE_INLINE MmappedBuffer(uint8_t *const buffer, const int bufferSize, + void *const mmappedBuffer, const int alignedSize, const int mmapFd, + const bool isUpdatable) + : mByteArrayView(buffer, bufferSize), mMmappedBuffer(mmappedBuffer), + mAlignedSize(alignedSize), mMmapFd(mmapFd), mIsUpdatable(isUpdatable) {} + + // Empty file. We have to handle an empty file as a valid part of a dictionary. + AK_FORCE_INLINE MmappedBuffer(const bool isUpdatable) + : mByteArrayView(), mMmappedBuffer(nullptr), mAlignedSize(0), + mMmapFd(0), mIsUpdatable(isUpdatable) {} + + DISALLOW_IMPLICIT_CONSTRUCTORS(MmappedBuffer); + + const ReadWriteByteArrayView mByteArrayView; + void *const mMmappedBuffer; + const int mAlignedSize; + const int mMmapFd; + const bool mIsUpdatable; +}; +} +#endif /* LATINIME_MMAPPED_BUFFER_H */ diff --git a/native/jni/src/dictionary/utils/multi_bigram_map.cpp b/native/jni/src/dictionary/utils/multi_bigram_map.cpp new file mode 100644 index 000000000..e730fff8e --- /dev/null +++ b/native/jni/src/dictionary/utils/multi_bigram_map.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/multi_bigram_map.h" + +#include <cstddef> +#include <unordered_map> + +namespace latinime { + +// Max number of bigram maps (previous word contexts) to be cached. Increasing this number +// could improve bigram lookup speed for multi-word suggestions, but at the cost of more memory +// usage. Also, there are diminishing returns since the most frequently used bigrams are +// typically near the beginning of the input and are thus the first ones to be cached. Note +// that these bigrams are reset for each new composing word. +const size_t MultiBigramMap::MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP = 25; + +// Most common previous word contexts currently have 100 bigrams +const int MultiBigramMap::BigramMap::DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP = 100; + +// Look up the bigram probability for the given word pair from the cached bigram maps. +// Also caches the bigrams if there is space remaining and they have not been cached already. +int MultiBigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, + const int unigramProbability) { + if (prevWordIds.empty() || prevWordIds[0] == NOT_A_WORD_ID) { + return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); + } + const auto mapPosition = mBigramMaps.find(prevWordIds[0]); + if (mapPosition != mBigramMaps.end()) { + return mapPosition->second.getBigramProbability(structurePolicy, nextWordId, + unigramProbability); + } + if (mBigramMaps.size() < MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP) { + addBigramsForWord(structurePolicy, prevWordIds); + return mBigramMaps[prevWordIds[0]].getBigramProbability(structurePolicy, + nextWordId, unigramProbability); + } + return readBigramProbabilityFromBinaryDictionary(structurePolicy, prevWordIds, + nextWordId, unigramProbability); +} + +void MultiBigramMap::BigramMap::init( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds) { + structurePolicy->iterateNgramEntries(prevWordIds, this /* listener */); +} + +int MultiBigramMap::BigramMap::getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordId, const int unigramProbability) const { + int bigramProbability = NOT_A_PROBABILITY; + if (mBloomFilter.isInFilter(nextWordId)) { + const auto bigramProbabilityIt = mBigramMap.find(nextWordId); + if (bigramProbabilityIt != mBigramMap.end()) { + bigramProbability = bigramProbabilityIt->second; + } + } + return structurePolicy->getProbability(unigramProbability, bigramProbability); +} + +void MultiBigramMap::BigramMap::onVisitEntry(const int ngramProbability, const int targetWordId) { + if (targetWordId == NOT_A_WORD_ID) { + return; + } + mBigramMap[targetWordId] = ngramProbability; + mBloomFilter.setInFilter(targetWordId); +} + +void MultiBigramMap::addBigramsForWord( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds) { + mBigramMaps[prevWordIds[0]].init(structurePolicy, prevWordIds); +} + +int MultiBigramMap::readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability) { + const int bigramProbability = structurePolicy->getProbabilityOfWord(prevWordIds, nextWordId); + if (bigramProbability != NOT_A_PROBABILITY) { + return bigramProbability; + } + return structurePolicy->getProbability(unigramProbability, NOT_A_PROBABILITY); +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/multi_bigram_map.h b/native/jni/src/dictionary/utils/multi_bigram_map.h new file mode 100644 index 000000000..6f23d98bc --- /dev/null +++ b/native/jni/src/dictionary/utils/multi_bigram_map.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_MULTI_BIGRAM_MAP_H +#define LATINIME_MULTI_BIGRAM_MAP_H + +#include <cstddef> +#include <unordered_map> + +#include "defines.h" +#include "dictionary/interface/dictionary_structure_with_buffer_policy.h" +#include "dictionary/interface/ngram_listener.h" +#include "dictionary/utils/binary_dictionary_bigrams_iterator.h" +#include "dictionary/utils/bloom_filter.h" +#include "utils/int_array_view.h" + +namespace latinime { + +// Class for caching bigram maps for multiple previous word contexts. This is useful since the +// algorithm needs to look up the set of bigrams for every word pair that occurs in every +// multi-word suggestion. +class MultiBigramMap { + public: + MultiBigramMap() : mBigramMaps() {} + ~MultiBigramMap() {} + + // Look up the bigram probability for the given word pair from the cached bigram maps. + // Also caches the bigrams if there is space remaining and they have not been cached already. + int getBigramProbability(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); + + void clear() { + mBigramMaps.clear(); + } + + private: + DISALLOW_COPY_AND_ASSIGN(MultiBigramMap); + + class BigramMap : public NgramListener { + public: + BigramMap() : mBigramMap(DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP), mBloomFilter() {} + // Copy constructor needed for std::unordered_map. + BigramMap(const BigramMap &bigramMap) + : mBigramMap(bigramMap.mBigramMap), mBloomFilter(bigramMap.mBloomFilter) {} + virtual ~BigramMap() {} + + void init(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); + int getBigramProbability( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const int nextWordId, const int unigramProbability) const; + virtual void onVisitEntry(const int ngramProbability, const int targetWordId); + + private: + static const int DEFAULT_HASH_MAP_SIZE_FOR_EACH_BIGRAM_MAP; + std::unordered_map<int, int> mBigramMap; + BloomFilter mBloomFilter; + }; + + void addBigramsForWord(const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds); + + int readBigramProbabilityFromBinaryDictionary( + const DictionaryStructureWithBufferPolicy *const structurePolicy, + const WordIdArrayView prevWordIds, const int nextWordId, const int unigramProbability); + + static const size_t MAX_CACHED_PREV_WORDS_IN_BIGRAM_MAP; + std::unordered_map<int, BigramMap> mBigramMaps; +}; +} // namespace latinime +#endif // LATINIME_MULTI_BIGRAM_MAP_H diff --git a/native/jni/src/dictionary/utils/probability_utils.cpp b/native/jni/src/dictionary/utils/probability_utils.cpp new file mode 100644 index 000000000..426a0e783 --- /dev/null +++ b/native/jni/src/dictionary/utils/probability_utils.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/probability_utils.h" + +namespace latinime { + +const float ProbabilityUtils::PROBABILITY_ENCODING_SCALER = 8.58923700372f; + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/probability_utils.h b/native/jni/src/dictionary/utils/probability_utils.h new file mode 100644 index 000000000..2050af1e9 --- /dev/null +++ b/native/jni/src/dictionary/utils/probability_utils.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_PROBABILITY_UTILS_H +#define LATINIME_PROBABILITY_UTILS_H + +#include <algorithm> +#include <cmath> + +#include "defines.h" + +namespace latinime { + +// TODO: Quit using bigram probability to indicate the delta. +class ProbabilityUtils { + public: + static AK_FORCE_INLINE int backoff(const int unigramProbability) { + return unigramProbability; + // For some reason, applying the backoff weight gives bad results in tests. To apply the + // backoff weight, we divide the probability by 2, which in our storing format means + // decreasing the score by 8. + // TODO: figure out what's wrong with this. + // return unigramProbability > 8 ? + // unigramProbability - 8 : (0 == unigramProbability ? 0 : 8); + } + + static AK_FORCE_INLINE int computeProbabilityForBigram( + const int unigramProbability, const int bigramProbability) { + // We divide the range [unigramProbability..255] in 16.5 steps - in other words, we want + // the unigram probability to be the median value of the 17th step from the top. A value of + // 0 for the bigram probability represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictEncoder#makeBigramFlags for details. + const float stepSize = static_cast<float>(MAX_PROBABILITY - unigramProbability) + / (1.5f + MAX_BIGRAM_ENCODED_PROBABILITY); + return unigramProbability + + static_cast<int>(static_cast<float>(bigramProbability + 1) * stepSize); + } + + // Encode probability using the same way as we are doing for main dictionaries. + static AK_FORCE_INLINE int encodeRawProbability(const float rawProbability) { + const float probability = static_cast<float>(MAX_PROBABILITY) + + log2f(rawProbability) * PROBABILITY_ENCODING_SCALER; + if (probability < 0.0f) { + return 0; + } + return std::min(static_cast<int>(probability + 0.5f), MAX_PROBABILITY); + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ProbabilityUtils); + + static const float PROBABILITY_ENCODING_SCALER; +}; +} +#endif /* LATINIME_PROBABILITY_UTILS_H */ diff --git a/native/jni/src/dictionary/utils/sparse_table.cpp b/native/jni/src/dictionary/utils/sparse_table.cpp new file mode 100644 index 000000000..029329fab --- /dev/null +++ b/native/jni/src/dictionary/utils/sparse_table.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/sparse_table.h" + +namespace latinime { + +const int SparseTable::NOT_EXIST = -1; +const int SparseTable::INDEX_SIZE = 4; + +bool SparseTable::contains(const int id) const { + const int readingPos = getPosInIndexTable(id); + if (id < 0 || mIndexTableBuffer->getTailPosition() <= readingPos) { + return false; + } + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, readingPos); + return index != NOT_EXIST; +} + +uint32_t SparseTable::get(const int id) const { + const int indexTableReadingPos = getPosInIndexTable(id); + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, indexTableReadingPos); + const int contentTableReadingPos = getPosInContentTable(id, index); + if (contentTableReadingPos < 0 + || contentTableReadingPos >= mContentTableBuffer->getTailPosition()) { + AKLOGE("contentTableReadingPos(%d) is invalid. id: %d, index: %d", + contentTableReadingPos, id, index); + return NOT_A_DICT_POS; + } + const int contentValue = mContentTableBuffer->readUint(mDataSize, contentTableReadingPos); + return contentValue == NOT_EXIST ? NOT_A_DICT_POS : contentValue; +} + +bool SparseTable::set(const int id, const uint32_t value) { + const int posInIndexTable = getPosInIndexTable(id); + // Extends the index table if needed. + int tailPos = mIndexTableBuffer->getTailPosition(); + while (tailPos <= posInIndexTable) { + if (!mIndexTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, INDEX_SIZE, &tailPos)) { + AKLOGE("cannot extend index table. tailPos: %d to: %d", tailPos, posInIndexTable); + return false; + } + } + if (contains(id)) { + // The entry is already in the content table. + const int index = mIndexTableBuffer->readUint(INDEX_SIZE, posInIndexTable); + if (!mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index))) { + AKLOGE("cannot update value %d. pos: %d, tailPos: %d, mDataSize: %d", value, + getPosInContentTable(id, index), mContentTableBuffer->getTailPosition(), + mDataSize); + return false; + } + return true; + } + // The entry is not in the content table. + // Create new entry in the content table. + const int index = getIndexFromContentTablePos(mContentTableBuffer->getTailPosition()); + if (!mIndexTableBuffer->writeUint(index, INDEX_SIZE, posInIndexTable)) { + AKLOGE("cannot write index %d. pos %d", index, posInIndexTable); + return false; + } + // Write a new block that containing the entry to be set. + int writingPos = getPosInContentTable(0 /* id */, index); + for (int i = 0; i < mBlockSize; ++i) { + if (!mContentTableBuffer->writeUintAndAdvancePosition(NOT_EXIST, mDataSize, + &writingPos)) { + AKLOGE("cannot write content table to extend. writingPos: %d, tailPos: %d, " + "mDataSize: %d", writingPos, mContentTableBuffer->getTailPosition(), mDataSize); + return false; + } + } + return mContentTableBuffer->writeUint(value, mDataSize, getPosInContentTable(id, index)); +} + +int SparseTable::getIndexFromContentTablePos(const int contentTablePos) const { + return contentTablePos / mDataSize / mBlockSize; +} + +int SparseTable::getPosInIndexTable(const int id) const { + return (id / mBlockSize) * INDEX_SIZE; +} + +int SparseTable::getPosInContentTable(const int id, const int index) const { + const int offset = id % mBlockSize; + return (index * mBlockSize + offset) * mDataSize; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/sparse_table.h b/native/jni/src/dictionary/utils/sparse_table.h new file mode 100644 index 000000000..bd1190e8b --- /dev/null +++ b/native/jni/src/dictionary/utils/sparse_table.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_SPARSE_TABLE_H +#define LATINIME_SPARSE_TABLE_H + +#include <cstdint> + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" + +namespace latinime { + +// TODO: Support multiple content buffers. +class SparseTable { + public: + SparseTable(BufferWithExtendableBuffer *const indexTableBuffer, + BufferWithExtendableBuffer *const contentTableBuffer, const int blockSize, + const int dataSize) + : mIndexTableBuffer(indexTableBuffer), mContentTableBuffer(contentTableBuffer), + mBlockSize(blockSize), mDataSize(dataSize) {} + + bool contains(const int id) const; + + uint32_t get(const int id) const; + + bool set(const int id, const uint32_t value); + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(SparseTable); + + int getIndexFromContentTablePos(const int contentTablePos) const; + + int getPosInIndexTable(const int id) const; + + int getPosInContentTable(const int id, const int index) const; + + static const int NOT_EXIST; + static const int INDEX_SIZE; + + BufferWithExtendableBuffer *const mIndexTableBuffer; + BufferWithExtendableBuffer *const mContentTableBuffer; + const int mBlockSize; + const int mDataSize; +}; +} // namespace latinime +#endif /* LATINIME_SPARSE_TABLE_H */ diff --git a/native/jni/src/dictionary/utils/trie_map.cpp b/native/jni/src/dictionary/utils/trie_map.cpp new file mode 100644 index 000000000..0bef8c702 --- /dev/null +++ b/native/jni/src/dictionary/utils/trie_map.cpp @@ -0,0 +1,460 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dictionary/utils/trie_map.h" + +#include "dictionary/utils/dict_file_writing_utils.h" + +namespace latinime { + +const int TrieMap::INVALID_INDEX = -1; +const int TrieMap::FIELD0_SIZE = 4; +const int TrieMap::FIELD1_SIZE = 3; +const int TrieMap::ENTRY_SIZE = FIELD0_SIZE + FIELD1_SIZE; +const uint32_t TrieMap::VALUE_FLAG = 0x400000; +const uint32_t TrieMap::VALUE_MASK = 0x3FFFFF; +const uint32_t TrieMap::INVALID_VALUE_IN_KEY_VALUE_ENTRY = VALUE_MASK; +const uint32_t TrieMap::TERMINAL_LINK_FLAG = 0x800000; +const uint32_t TrieMap::TERMINAL_LINK_MASK = 0x7FFFFF; +const int TrieMap::NUM_OF_BITS_USED_FOR_ONE_LEVEL = 5; +const uint32_t TrieMap::LABEL_MASK = 0x1F; +const int TrieMap::MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL = 1 << NUM_OF_BITS_USED_FOR_ONE_LEVEL; +const int TrieMap::ROOT_BITMAP_ENTRY_INDEX = 0; +const int TrieMap::ROOT_BITMAP_ENTRY_POS = MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL * FIELD0_SIZE; +const TrieMap::Entry TrieMap::EMPTY_BITMAP_ENTRY = TrieMap::Entry(0, 0); +const int TrieMap::TERMINAL_LINKED_ENTRY_COUNT = 2; // Value entry and bitmap entry. +const uint64_t TrieMap::MAX_VALUE = + (static_cast<uint64_t>(1) << ((FIELD0_SIZE + FIELD1_SIZE) * CHAR_BIT)) - 1; +const int TrieMap::MAX_BUFFER_SIZE = TERMINAL_LINK_MASK * ENTRY_SIZE; + +TrieMap::TrieMap() : mBuffer(MAX_BUFFER_SIZE) { + mBuffer.extend(ROOT_BITMAP_ENTRY_POS); + writeEntry(EMPTY_BITMAP_ENTRY, ROOT_BITMAP_ENTRY_INDEX); +} + +TrieMap::TrieMap(const ReadWriteByteArrayView buffer) + : mBuffer(buffer, BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE) {} + +void TrieMap::dump(const int from, const int to) const { + AKLOGI("BufSize: %d", mBuffer.getTailPosition()); + for (int i = from; i < to; ++i) { + AKLOGI("Entry[%d]: %x, %x", i, readField0(i), readField1(i)); + } + int unusedRegionSize = 0; + for (int i = 1; i <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; ++i) { + int index = readEmptyTableLink(i); + while (index != ROOT_BITMAP_ENTRY_INDEX) { + index = readField0(index); + unusedRegionSize += i; + } + } + AKLOGI("Unused Size: %d", unusedRegionSize); +} + +int TrieMap::getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast<uint32_t>(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. + return INVALID_INDEX; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (terminalEntry.hasTerminalLink()) { + return terminalEntry.getValueEntryIndex() + 1; + } + // Create a value entry and a bitmap entry. + const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return INVALID_INDEX; + } + if (!writeEntry(Entry(0, terminalEntry.getValue()), valueEntryIndex)) { + return INVALID_INDEX; + } + if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { + return INVALID_INDEX; + } + if (!writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex)) { + return INVALID_INDEX; + } + return valueEntryIndex + 1; +} + +const TrieMap::Result TrieMap::get(const int key, const int bitmapEntryIndex) const { + const uint32_t unsignedKey = static_cast<uint32_t>(key); + return getInternal(unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntryIndex, + 0 /* level */); +} + +bool TrieMap::put(const int key, const uint64_t value, const int bitmapEntryIndex) { + if (value > MAX_VALUE) { + return false; + } + const uint32_t unsignedKey = static_cast<uint32_t>(key); + return putInternal(unsignedKey, value, getBitShuffledKey(unsignedKey), bitmapEntryIndex, + readEntry(bitmapEntryIndex), 0 /* level */); +} + +bool TrieMap::save(FILE *const file) const { + return DictFileWritingUtils::writeBufferToFileTail(file, &mBuffer); +} + +bool TrieMap::remove(const int key, const int bitmapEntryIndex) { + const Entry bitmapEntry = readEntry(bitmapEntryIndex); + const uint32_t unsignedKey = static_cast<uint32_t>(key); + const int terminalEntryIndex = getTerminalEntryIndex( + unsignedKey, getBitShuffledKey(unsignedKey), bitmapEntry, 0 /* level */); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. + return false; + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , terminalEntryIndex)) { + return false; + } + if (terminalEntry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(terminalEntry.getValueEntryIndex() + 1); + if (!freeTable(terminalEntry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)){ + return false; + } + } + return true; +} + +/** + * Iterate next entry in a certain level. + * + * @param iterationState the iteration state that will be read and updated in this method. + * @param outKey the output key + * @return Result instance. mIsValid is false when all entries are iterated. + */ +const TrieMap::Result TrieMap::iterateNext(std::vector<TableIterationState> *const iterationState, + int *const outKey) const { + while (!iterationState->empty()) { + TableIterationState &state = iterationState->back(); + if (state.mTableSize <= state.mCurrentIndex) { + // Move to parent. + iterationState->pop_back(); + } else { + const int entryIndex = state.mTableIndex + state.mCurrentIndex; + state.mCurrentIndex += 1; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Move to child. + iterationState->emplace_back(popCount(entry.getBitmap()), entry.getTableIndex()); + } else if (entry.isValidTerminalEntry()) { + if (outKey) { + *outKey = entry.getKey(); + } + if (!entry.hasTerminalLink()) { + return Result(entry.getValue(), true, INVALID_INDEX); + } + const int valueEntryIndex = entry.getValueEntryIndex(); + const Entry valueEntry = readEntry(valueEntryIndex); + return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); + } + } + } + // Visited all entries. + return Result(0, false, INVALID_INDEX); +} + +/** + * Shuffle bits of the key in the fixed order. + * + * This method is used as a hash function. This returns different values for different inputs. + */ +uint32_t TrieMap::getBitShuffledKey(const uint32_t key) const { + uint32_t shuffledKey = 0; + for (int i = 0; i < 4; ++i) { + const uint32_t keyPiece = (key >> (i * 8)) & 0xFF; + shuffledKey ^= ((keyPiece ^ (keyPiece << 7) ^ (keyPiece << 14) ^ (keyPiece << 21)) + & 0x11111111) << i; + } + return shuffledKey; +} + +bool TrieMap::writeValue(const uint64_t value, const int terminalEntryIndex) { + if (value < VALUE_MASK) { + // Write value into the terminal entry. + return writeField1(value | VALUE_FLAG, terminalEntryIndex); + } + // Create value entry and write value. + const int valueEntryIndex = allocateTable(TERMINAL_LINKED_ENTRY_COUNT); + if (valueEntryIndex == INVALID_INDEX) { + return false; + } + if (!writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex)) { + return false; + } + if (!writeEntry(EMPTY_BITMAP_ENTRY, valueEntryIndex + 1)) { + return false; + } + return writeField1(valueEntryIndex | TERMINAL_LINK_FLAG, terminalEntryIndex); +} + +bool TrieMap::updateValue(const Entry &terminalEntry, const uint64_t value, + const int terminalEntryIndex) { + if (!terminalEntry.hasTerminalLink()) { + return writeValue(value, terminalEntryIndex); + } + const int valueEntryIndex = terminalEntry.getValueEntryIndex(); + return writeEntry(Entry(value >> (FIELD1_SIZE * CHAR_BIT), value), valueEntryIndex); +} + +bool TrieMap::freeTable(const int tableIndex, const int entryCount) { + if (!writeField0(readEmptyTableLink(entryCount), tableIndex)) { + return false; + } + return writeEmptyTableLink(tableIndex, entryCount); +} + +/** + * Allocate table with entryCount-entries. Reuse freed table if possible. + */ +int TrieMap::allocateTable(const int entryCount) { + if (entryCount > 0 && entryCount <= MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL) { + const int tableIndex = readEmptyTableLink(entryCount); + if (tableIndex > 0) { + if (!writeEmptyTableLink(readField0(tableIndex), entryCount)) { + return INVALID_INDEX; + } + // Reuse the table. + return tableIndex; + } + } + // Allocate memory space at tail position of the buffer. + const int mapIndex = getTailEntryIndex(); + if (!mBuffer.extend(entryCount * ENTRY_SIZE)) { + return INVALID_INDEX; + } + return mapIndex; +} + +int TrieMap::getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, + const Entry &bitmapEntry, const int level) const { + const int label = getLabel(hashedKey, level); + if (!exists(bitmapEntry.getBitmap(), label)) { + return INVALID_INDEX; + } + const int entryIndex = bitmapEntry.getTableIndex() + popCount(bitmapEntry.getBitmap(), label); + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Move to the next level. + return getTerminalEntryIndex(key, hashedKey, entry, level + 1); + } + if (!entry.isValidTerminalEntry()) { + return INVALID_INDEX; + } + if (entry.getKey() == key) { + // Terminal entry is found. + return entryIndex; + } + return INVALID_INDEX; +} + +/** + * Get Result corresponding to the key. + * + * @param key the key. + * @param hashedKey the hashed key. + * @param bitmapEntryIndex the index of bitmap entry + * @param level current level + * @return Result instance corresponding to the key. mIsValid indicates whether the key is in the + * map. + */ +const TrieMap::Result TrieMap::getInternal(const uint32_t key, const uint32_t hashedKey, + const int bitmapEntryIndex, const int level) const { + const int terminalEntryIndex = getTerminalEntryIndex(key, hashedKey, + readEntry(bitmapEntryIndex), level); + if (terminalEntryIndex == INVALID_INDEX) { + // Not found. + return Result(0, false, INVALID_INDEX); + } + const Entry terminalEntry = readEntry(terminalEntryIndex); + if (!terminalEntry.hasTerminalLink()) { + return Result(terminalEntry.getValue(), true, INVALID_INDEX); + } + const int valueEntryIndex = terminalEntry.getValueEntryIndex(); + const Entry valueEntry = readEntry(valueEntryIndex); + return Result(valueEntry.getValueOfValueEntry(), true, valueEntryIndex + 1); +} + +/** + * Put key to value mapping to the map. + * + * @param key the key. + * @param value the value + * @param hashedKey the hashed key. + * @param bitmapEntryIndex the index of bitmap entry + * @param bitmapEntry the bitmap entry + * @param level current level + * @return whether the key-value has been correctly inserted to the map or not. + */ +bool TrieMap::putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, + const int bitmapEntryIndex, const Entry &bitmapEntry, const int level) { + const int label = getLabel(hashedKey, level); + const uint32_t bitmap = bitmapEntry.getBitmap(); + const int mapIndex = bitmapEntry.getTableIndex(); + if (!exists(bitmap, label)) { + // Current map doesn't contain the label. + return addNewEntryByExpandingTable(key, value, mapIndex, bitmap, bitmapEntryIndex, label); + } + const int entryIndex = mapIndex + popCount(bitmap, label); + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Bitmap entry is found. Go to the next level. + return putInternal(key, value, hashedKey, entryIndex, entry, level + 1); + } + if (!entry.isValidTerminalEntry()) { + // Overwrite invalid terminal entry. + return writeTerminalEntry(key, value, entryIndex); + } + if (entry.getKey() == key) { + // Terminal entry for the key is found. Update the value. + return updateValue(entry, value, entryIndex); + } + // Conflict with the existing key. + return addNewEntryByResolvingConflict(key, value, hashedKey, entry, entryIndex, level); +} + +/** + * Resolve a conflict in the current level and add new entry. + * + * @param key the key + * @param value the value + * @param hashedKey the hashed key + * @param conflictedEntry the existing conflicted entry + * @param conflictedEntryIndex the index of existing conflicted entry + * @param level current level + * @return whether the key-value has been correctly inserted to the map or not. + */ +bool TrieMap::addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, + const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, + const int level) { + const int conflictedKeyNextLabel = + getLabel(getBitShuffledKey(conflictedEntry.getKey()), level + 1); + const int nextLabel = getLabel(hashedKey, level + 1); + if (conflictedKeyNextLabel == nextLabel) { + // Conflicted again in the next level. + const int newTableIndex = allocateTable(1 /* entryCount */); + if (newTableIndex == INVALID_INDEX) { + return false; + } + if (!writeEntry(conflictedEntry, newTableIndex)) { + return false; + } + const Entry newBitmapEntry(setExist(0 /* bitmap */, nextLabel), newTableIndex); + if (!writeEntry(newBitmapEntry, conflictedEntryIndex)) { + return false; + } + return putInternal(key, value, hashedKey, conflictedEntryIndex, newBitmapEntry, level + 1); + } + // The conflict has been resolved. Create a table that contains 2 entries. + const int newTableIndex = allocateTable(2 /* entryCount */); + if (newTableIndex == INVALID_INDEX) { + return false; + } + if (nextLabel < conflictedKeyNextLabel) { + if (!writeTerminalEntry(key, value, newTableIndex)) { + return false; + } + if (!writeEntry(conflictedEntry, newTableIndex + 1)) { + return false; + } + } else { // nextLabel > conflictedKeyNextLabel + if (!writeEntry(conflictedEntry, newTableIndex)) { + return false; + } + if (!writeTerminalEntry(key, value, newTableIndex + 1)) { + return false; + } + } + const uint32_t updatedBitmap = + setExist(setExist(0 /* bitmap */, nextLabel), conflictedKeyNextLabel); + return writeEntry(Entry(updatedBitmap, newTableIndex), conflictedEntryIndex); +} + +/** + * Add new entry to the existing table. + */ +bool TrieMap::addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, + const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, const int label) { + // Current map doesn't contain the label. + const int entryCount = popCount(bitmap); + const int newTableIndex = allocateTable(entryCount + 1); + if (newTableIndex == INVALID_INDEX) { + return false; + } + const int newEntryIndexInTable = popCount(bitmap, label); + // Copy from existing table to the new table. + for (int i = 0; i < entryCount; ++i) { + if (!copyEntry(tableIndex + i, newTableIndex + i + (i >= newEntryIndexInTable ? 1 : 0))) { + return false; + } + } + // Write new terminal entry. + if (!writeTerminalEntry(key, value, newTableIndex + newEntryIndexInTable)) { + return false; + } + // Update bitmap. + if (!writeEntry(Entry(setExist(bitmap, label), newTableIndex), bitmapEntryIndex)) { + return false; + } + if (entryCount > 0) { + return freeTable(tableIndex, entryCount); + } + return true; +} + +bool TrieMap::removeInner(const Entry &bitmapEntry) { + const int tableSize = popCount(bitmapEntry.getBitmap()); + if (tableSize <= 0) { + // The table is empty. No need to remove any entries. + return true; + } + for (int i = 0; i < tableSize; ++i) { + const int entryIndex = bitmapEntry.getTableIndex() + i; + const Entry entry = readEntry(entryIndex); + if (entry.isBitmapEntry()) { + // Delete next bitmap entry recursively. + if (!removeInner(entry)) { + return false; + } + } else { + // Invalidate terminal entry just in case. + if (!writeField1(VALUE_FLAG ^ INVALID_VALUE_IN_KEY_VALUE_ENTRY , entryIndex)) { + return false; + } + if (entry.hasTerminalLink()) { + const Entry nextLevelBitmapEntry = readEntry(entry.getValueEntryIndex() + 1); + if (!freeTable(entry.getValueEntryIndex(), TERMINAL_LINKED_ENTRY_COUNT)) { + return false; + } + if (!removeInner(nextLevelBitmapEntry)) { + return false; + } + } + } + } + return true; +} + +} // namespace latinime diff --git a/native/jni/src/dictionary/utils/trie_map.h b/native/jni/src/dictionary/utils/trie_map.h new file mode 100644 index 000000000..5fc6c2690 --- /dev/null +++ b/native/jni/src/dictionary/utils/trie_map.h @@ -0,0 +1,399 @@ +/* + * Copyright (C) 2014, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TRIE_MAP_H +#define LATINIME_TRIE_MAP_H + +#include <climits> +#include <cstdint> +#include <cstdio> +#include <vector> + +#include "defines.h" +#include "dictionary/utils/buffer_with_extendable_buffer.h" +#include "utils/byte_array_view.h" + +namespace latinime { + +/** + * Trie map derived from Phil Bagwell's Hash Array Mapped Trie. + * key is int and value is uint64_t. + * This supports multiple level map. Terminal entries can have a bitmap for the next level map. + * This doesn't support root map resizing. + */ +class TrieMap { + public: + struct Result { + const uint64_t mValue; + const bool mIsValid; + const int mNextLevelBitmapEntryIndex; + + Result(const uint64_t value, const bool isValid, const int nextLevelBitmapEntryIndex) + : mValue(value), mIsValid(isValid), + mNextLevelBitmapEntryIndex(nextLevelBitmapEntryIndex) {} + }; + + /** + * Struct to record iteration state in a table. + */ + struct TableIterationState { + int mTableSize; + int mTableIndex; + int mCurrentIndex; + + TableIterationState(const int tableSize, const int tableIndex) + : mTableSize(tableSize), mTableIndex(tableIndex), mCurrentIndex(0) {} + }; + + class TrieMapRange; + class TrieMapIterator { + public: + class IterationResult { + public: + IterationResult(const TrieMap *const trieMap, const int key, const uint64_t value, + const int nextLeveBitmapEntryIndex) + : mTrieMap(trieMap), mKey(key), mValue(value), + mNextLevelBitmapEntryIndex(nextLeveBitmapEntryIndex) {} + + const TrieMapRange getEntriesInNextLevel() const { + return TrieMapRange(mTrieMap, mNextLevelBitmapEntryIndex); + } + + bool hasNextLevelMap() const { + return mNextLevelBitmapEntryIndex != INVALID_INDEX; + } + + AK_FORCE_INLINE int key() const { + return mKey; + } + + AK_FORCE_INLINE uint64_t value() const { + return mValue; + } + + AK_FORCE_INLINE int getNextLevelBitmapEntryIndex() const { + return mNextLevelBitmapEntryIndex; + } + + private: + const TrieMap *const mTrieMap; + const int mKey; + const uint64_t mValue; + const int mNextLevelBitmapEntryIndex; + }; + + TrieMapIterator(const TrieMap *const trieMap, const int bitmapEntryIndex) + : mTrieMap(trieMap), mStateStack(), mBaseBitmapEntryIndex(bitmapEntryIndex), + mKey(0), mValue(0), mIsValid(false), mNextLevelBitmapEntryIndex(INVALID_INDEX) { + if (!trieMap || mBaseBitmapEntryIndex == INVALID_INDEX) { + return; + } + const Entry bitmapEntry = mTrieMap->readEntry(mBaseBitmapEntryIndex); + mStateStack.emplace_back( + mTrieMap->popCount(bitmapEntry.getBitmap()), bitmapEntry.getTableIndex()); + this->operator++(); + } + + const IterationResult operator*() const { + return IterationResult(mTrieMap, mKey, mValue, mNextLevelBitmapEntryIndex); + } + + bool operator!=(const TrieMapIterator &other) const { + // Caveat: This works only for for loops. + return mIsValid || other.mIsValid; + } + + const TrieMapIterator &operator++() { + const Result result = mTrieMap->iterateNext(&mStateStack, &mKey); + mValue = result.mValue; + mIsValid = result.mIsValid; + mNextLevelBitmapEntryIndex = result.mNextLevelBitmapEntryIndex; + return *this; + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapIterator); + DISALLOW_ASSIGNMENT_OPERATOR(TrieMapIterator); + + const TrieMap *const mTrieMap; + std::vector<TrieMap::TableIterationState> mStateStack; + const int mBaseBitmapEntryIndex; + int mKey; + uint64_t mValue; + bool mIsValid; + int mNextLevelBitmapEntryIndex; + }; + + /** + * Class to support iterating entries in TrieMap by range base for loops. + */ + class TrieMapRange { + public: + TrieMapRange(const TrieMap *const trieMap, const int bitmapEntryIndex) + : mTrieMap(trieMap), mBaseBitmapEntryIndex(bitmapEntryIndex) {}; + + TrieMapIterator begin() const { + return TrieMapIterator(mTrieMap, mBaseBitmapEntryIndex); + } + + const TrieMapIterator end() const { + return TrieMapIterator(nullptr, INVALID_INDEX); + } + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(TrieMapRange); + DISALLOW_ASSIGNMENT_OPERATOR(TrieMapRange); + + const TrieMap *const mTrieMap; + const int mBaseBitmapEntryIndex; + }; + + static const int INVALID_INDEX; + static const uint64_t MAX_VALUE; + + TrieMap(); + // Construct TrieMap using existing data in the memory region written by save(). + TrieMap(const ReadWriteByteArrayView buffer); + void dump(const int from = 0, const int to = 0) const; + + bool isNearSizeLimit() const { + return mBuffer.isNearSizeLimit(); + } + + int getRootBitmapEntryIndex() const { + return ROOT_BITMAP_ENTRY_INDEX; + } + + // Returns bitmapEntryIndex. Create the next level map if it doesn't exist. + int getNextLevelBitmapEntryIndex(const int key) { + return getNextLevelBitmapEntryIndex(key, ROOT_BITMAP_ENTRY_INDEX); + } + + int getNextLevelBitmapEntryIndex(const int key, const int bitmapEntryIndex); + + const Result getRoot(const int key) const { + return get(key, ROOT_BITMAP_ENTRY_INDEX); + } + + const Result get(const int key, const int bitmapEntryIndex) const; + + bool putRoot(const int key, const uint64_t value) { + return put(key, value, ROOT_BITMAP_ENTRY_INDEX); + } + + bool put(const int key, const uint64_t value, const int bitmapEntryIndex); + + const TrieMapRange getEntriesInRootLevel() const { + return getEntriesInSpecifiedLevel(ROOT_BITMAP_ENTRY_INDEX); + } + + const TrieMapRange getEntriesInSpecifiedLevel(const int bitmapEntryIndex) const { + return TrieMapRange(this, bitmapEntryIndex); + } + + bool save(FILE *const file) const; + + bool remove(const int key, const int bitmapEntryIndex); + + private: + DISALLOW_COPY_AND_ASSIGN(TrieMap); + + /** + * Struct represents an entry. + * + * Entry is one of these entry types. All entries are fixed size and have 2 fields FIELD_0 and + * FIELD_1. + * 1. bitmap entry. bitmap entry contains bitmap and the link to hash table. + * FIELD_0(bitmap) FIELD_1(LINK_TO_HASH_TABLE) + * 2. terminal entry. terminal entry contains hashed key and value or terminal link. terminal + * entry have terminal link when the value is not fit to FIELD_1 or there is a next level map + * for the key. + * FIELD_0(hashed key) (FIELD_1(VALUE_FLAG VALUE) | FIELD_1(TERMINAL_LINK_FLAG TERMINAL_LINK)) + * 3. value entry. value entry represents a value. Upper order bytes are stored in FIELD_0 and + * lower order bytes are stored in FIELD_1. + * FIELD_0(value (upper order bytes)) FIELD_1(value (lower order bytes)) + */ + struct Entry { + const uint32_t mData0; + const uint32_t mData1; + + Entry(const uint32_t data0, const uint32_t data1) : mData0(data0), mData1(data1) {} + + AK_FORCE_INLINE bool isBitmapEntry() const { + return (mData1 & VALUE_FLAG) == 0 && (mData1 & TERMINAL_LINK_FLAG) == 0; + } + + AK_FORCE_INLINE bool hasTerminalLink() const { + return (mData1 & TERMINAL_LINK_FLAG) != 0; + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getKey() const { + return mData0; + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getValue() const { + return mData1 & VALUE_MASK; + } + + // For terminal entry. + AK_FORCE_INLINE bool isValidTerminalEntry() const { + return hasTerminalLink() || ((mData1 & VALUE_MASK) != INVALID_VALUE_IN_KEY_VALUE_ENTRY); + } + + // For terminal entry. + AK_FORCE_INLINE uint32_t getValueEntryIndex() const { + return mData1 & TERMINAL_LINK_MASK; + } + + // For bitmap entry. + AK_FORCE_INLINE uint32_t getBitmap() const { + return mData0; + } + + // For bitmap entry. + AK_FORCE_INLINE int getTableIndex() const { + return static_cast<int>(mData1); + } + + // For value entry. + AK_FORCE_INLINE uint64_t getValueOfValueEntry() const { + return ((static_cast<uint64_t>(mData0) << (FIELD1_SIZE * CHAR_BIT)) ^ mData1); + } + }; + + BufferWithExtendableBuffer mBuffer; + + static const int FIELD0_SIZE; + static const int FIELD1_SIZE; + static const int ENTRY_SIZE; + static const uint32_t VALUE_FLAG; + static const uint32_t VALUE_MASK; + static const uint32_t INVALID_VALUE_IN_KEY_VALUE_ENTRY; + static const uint32_t TERMINAL_LINK_FLAG; + static const uint32_t TERMINAL_LINK_MASK; + static const int NUM_OF_BITS_USED_FOR_ONE_LEVEL; + static const uint32_t LABEL_MASK; + static const int MAX_NUM_OF_ENTRIES_IN_ONE_LEVEL; + static const int ROOT_BITMAP_ENTRY_INDEX; + static const int ROOT_BITMAP_ENTRY_POS; + static const Entry EMPTY_BITMAP_ENTRY; + static const int TERMINAL_LINKED_ENTRY_COUNT; + static const int MAX_BUFFER_SIZE; + + uint32_t getBitShuffledKey(const uint32_t key) const; + bool writeValue(const uint64_t value, const int terminalEntryIndex); + bool updateValue(const Entry &terminalEntry, const uint64_t value, + const int terminalEntryIndex); + bool freeTable(const int tableIndex, const int entryCount); + int allocateTable(const int entryCount); + int getTerminalEntryIndex(const uint32_t key, const uint32_t hashedKey, + const Entry &bitmapEntry, const int level) const; + const Result getInternal(const uint32_t key, const uint32_t hashedKey, + const int bitmapEntryIndex, const int level) const; + bool putInternal(const uint32_t key, const uint64_t value, const uint32_t hashedKey, + const int bitmapEntryIndex, const Entry &bitmapEntry, const int level); + bool addNewEntryByResolvingConflict(const uint32_t key, const uint64_t value, + const uint32_t hashedKey, const Entry &conflictedEntry, const int conflictedEntryIndex, + const int level); + bool addNewEntryByExpandingTable(const uint32_t key, const uint64_t value, + const int tableIndex, const uint32_t bitmap, const int bitmapEntryIndex, + const int label); + const Result iterateNext(std::vector<TableIterationState> *const iterationState, + int *const outKey) const; + + AK_FORCE_INLINE const Entry readEntry(const int entryIndex) const { + return Entry(readField0(entryIndex), readField1(entryIndex)); + } + + // Returns whether an entry for the index is existing by testing if the index-th bit in the + // bitmap is set or not. + AK_FORCE_INLINE bool exists(const uint32_t bitmap, const int index) const { + return (bitmap & (1 << index)) != 0; + } + + // Set index-th bit in the bitmap. + AK_FORCE_INLINE uint32_t setExist(const uint32_t bitmap, const int index) const { + return bitmap | (1 << index); + } + + // Count set bits before index in the bitmap. + AK_FORCE_INLINE int popCount(const uint32_t bitmap, const int index) const { + return popCount(bitmap & ((1 << index) - 1)); + } + + // Count set bits in the bitmap. + AK_FORCE_INLINE int popCount(const uint32_t bitmap) const { + return __builtin_popcount(bitmap); + // int v = bitmap - ((bitmap >> 1) & 0x55555555); + // v = (v & 0x33333333) + ((v >> 2) & 0x33333333); + // return (((v + (v >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; + } + + AK_FORCE_INLINE int getLabel(const uint32_t hashedKey, const int level) const { + return (hashedKey >> (level * NUM_OF_BITS_USED_FOR_ONE_LEVEL)) & LABEL_MASK; + } + + AK_FORCE_INLINE uint32_t readField0(const int entryIndex) const { + return mBuffer.readUint(FIELD0_SIZE, ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); + } + + AK_FORCE_INLINE uint32_t readField1(const int entryIndex) const { + return mBuffer.readUint(FIELD1_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); + } + + AK_FORCE_INLINE int readEmptyTableLink(const int entryCount) const { + return mBuffer.readUint(FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); + } + + AK_FORCE_INLINE bool writeEmptyTableLink(const int tableIndex, const int entryCount) { + return mBuffer.writeUint(tableIndex, FIELD1_SIZE, (entryCount - 1) * FIELD1_SIZE); + } + + AK_FORCE_INLINE bool writeField0(const uint32_t data, const int entryIndex) { + return mBuffer.writeUint(data, FIELD0_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE); + } + + AK_FORCE_INLINE bool writeField1(const uint32_t data, const int entryIndex) { + return mBuffer.writeUint(data, FIELD1_SIZE, + ROOT_BITMAP_ENTRY_POS + entryIndex * ENTRY_SIZE + FIELD0_SIZE); + } + + AK_FORCE_INLINE bool writeEntry(const Entry &entry, const int entryIndex) { + return writeField0(entry.mData0, entryIndex) && writeField1(entry.mData1, entryIndex); + } + + AK_FORCE_INLINE bool writeTerminalEntry(const uint32_t key, const uint64_t value, + const int entryIndex) { + return writeField0(key, entryIndex) && writeValue(value, entryIndex); + } + + AK_FORCE_INLINE bool copyEntry(const int originalEntryIndex, const int newEntryIndex) { + return writeEntry(readEntry(originalEntryIndex), newEntryIndex); + } + + AK_FORCE_INLINE int getTailEntryIndex() const { + return (mBuffer.getTailPosition() - ROOT_BITMAP_ENTRY_POS) / ENTRY_SIZE; + } + + bool removeInner(const Entry &bitmapEntry); +}; + +} // namespace latinime +#endif /* LATINIME_TRIE_MAP_H */ |