diff options
Diffstat (limited to 'native/jni/src/dictionary/utils/byte_array_utils.h')
-rw-r--r-- | native/jni/src/dictionary/utils/byte_array_utils.h | 290 |
1 files changed, 290 insertions, 0 deletions
diff --git a/native/jni/src/dictionary/utils/byte_array_utils.h b/native/jni/src/dictionary/utils/byte_array_utils.h new file mode 100644 index 000000000..abb979050 --- /dev/null +++ b/native/jni/src/dictionary/utils/byte_array_utils.h @@ -0,0 +1,290 @@ +/* + * Copyright (C) 2013, The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_BYTE_ARRAY_UTILS_H +#define LATINIME_BYTE_ARRAY_UTILS_H + +#include <cstdint> + +#include "defines.h" + +namespace latinime { + +/** + * Utility methods for reading byte arrays. + */ +class ByteArrayUtils { + public: + /** + * Integer writing + * + * Each method write a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE void writeUintAndAdvancePosition(uint8_t *const buffer, + const uint32_t data, const int size, int *const pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + ByteArrayUtils::writeUint8AndAdvancePosition(buffer, data, pos); + return; + case 2: + ByteArrayUtils::writeUint16AndAdvancePosition(buffer, data, pos); + return; + case 3: + ByteArrayUtils::writeUint24AndAdvancePosition(buffer, data, pos); + return; + case 4: + ByteArrayUtils::writeUint32AndAdvancePosition(buffer, data, pos); + return; + default: + break; + } + } + + /** + * Integer reading + * + * Each method read a corresponding size integer in a big endian manner. + */ + static AK_FORCE_INLINE uint32_t readUint32(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 24) ^ (buffer[pos + 1] << 16) + ^ (buffer[pos + 2] << 8) ^ buffer[pos + 3]; + } + + static AK_FORCE_INLINE uint32_t readUint24(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 16) ^ (buffer[pos + 1] << 8) ^ buffer[pos + 2]; + } + + static AK_FORCE_INLINE uint16_t readUint16(const uint8_t *const buffer, const int pos) { + return (buffer[pos] << 8) ^ buffer[pos + 1]; + } + + static AK_FORCE_INLINE uint8_t readUint8(const uint8_t *const buffer, const int pos) { + return buffer[pos]; + } + + static AK_FORCE_INLINE uint32_t readUint32AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint32(buffer, *pos); + *pos += 4; + return value; + } + + static AK_FORCE_INLINE int readSint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint8_t value = readUint8(buffer, *pos); + if (value < 0x80) { + return readUint24AndAdvancePosition(buffer, pos); + } else { + (*pos)++; + return -(((value & 0x7F) << 16) ^ readUint16AndAdvancePosition(buffer, pos)); + } + } + + static AK_FORCE_INLINE uint32_t readUint24AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint32_t value = readUint24(buffer, *pos); + *pos += 3; + return value; + } + + static AK_FORCE_INLINE uint16_t readUint16AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + const uint16_t value = readUint16(buffer, *pos); + *pos += 2; + return value; + } + + static AK_FORCE_INLINE uint8_t readUint8AndAdvancePosition( + const uint8_t *const buffer, int *const pos) { + return buffer[(*pos)++]; + } + + static AK_FORCE_INLINE uint32_t readUint(const uint8_t *const buffer, + const int size, const int pos) { + // size must be in 1 to 4. + ASSERT(size >= 1 && size <= 4); + switch (size) { + case 1: + return ByteArrayUtils::readUint8(buffer, pos); + case 2: + return ByteArrayUtils::readUint16(buffer, pos); + case 3: + return ByteArrayUtils::readUint24(buffer, pos); + case 4: + return ByteArrayUtils::readUint32(buffer, pos); + default: + return 0; + } + } + + /** + * Code Point Reading + * + * 1 byte = bbbbbbbb match + * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte + * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because + * unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with + * 00011111 would be outside unicode. + * else: iso-latin-1 code + * This allows for the whole unicode range to be encoded, including chars outside of + * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control + * characters which should never happen anyway (and still work, but take 3 bytes). + */ + static AK_FORCE_INLINE int readCodePoint(const uint8_t *const buffer, const int pos) { + int p = pos; + return readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, &p); + } + + static AK_FORCE_INLINE int readCodePointAndAdvancePosition( + const uint8_t *const buffer, const int *const codePointTable, int *const pos) { + /* + * codePointTable is an array to convert the most frequent characters in this dictionary to + * 1 byte code points. It is only made of the original code points of the most frequent + * characters used in this dictionary. 0x20 - 0xFF is used for the 1 byte characters. + * The original code points are restored by picking the code points at the indices of the + * codePointTable. The indices are calculated by subtracting 0x20 from the firstByte. + */ + const uint8_t firstByte = readUint8(buffer, *pos); + if (firstByte < MINIMUM_ONE_BYTE_CHARACTER_VALUE) { + if (firstByte == CHARACTER_ARRAY_TERMINATOR) { + *pos += 1; + return NOT_A_CODE_POINT; + } else { + return readUint24AndAdvancePosition(buffer, pos); + } + } else { + *pos += 1; + if (codePointTable) { + return codePointTable[firstByte - MINIMUM_ONE_BYTE_CHARACTER_VALUE]; + } + return firstByte; + } + } + + /** + * String (array of code points) Reading + * + * Reads code points until the terminator is found. + */ + // Returns the length of the string. + static int readStringAndAdvancePosition(const uint8_t *const buffer, + const int maxLength, const int *const codePointTable, int *const outBuffer, + int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + outBuffer[length++] = codePoint; + codePoint = readCodePointAndAdvancePosition(buffer, codePointTable, pos); + } + return length; + } + + // Advances the position and returns the length of the string. + static int advancePositionToBehindString( + const uint8_t *const buffer, const int maxLength, int *const pos) { + int length = 0; + int codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); + while (NOT_A_CODE_POINT != codePoint && length < maxLength) { + codePoint = readCodePointAndAdvancePosition(buffer, nullptr /* codePointTable */, pos); + length++; + } + return length; + } + + /** + * String (array of code points) Writing + */ + static void writeCodePointsAndAdvancePosition(uint8_t *const buffer, + const int *const codePoints, const int codePointCount, const bool writesTerminator, + int *const pos) { + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + writeUint24AndAdvancePosition(buffer, codePoint, pos); + } else { + // one byte character. + writeUint8AndAdvancePosition(buffer, codePoint, pos); + } + } + if (writesTerminator) { + writeUint8AndAdvancePosition(buffer, CHARACTER_ARRAY_TERMINATOR, pos); + } + } + + static int calculateRequiredByteCountToStoreCodePoints(const int *const codePoints, + const int codePointCount, const bool writesTerminator) { + int byteCount = 0; + for (int i = 0; i < codePointCount; ++i) { + const int codePoint = codePoints[i]; + if (codePoint == NOT_A_CODE_POINT || codePoint == CHARACTER_ARRAY_TERMINATOR) { + break; + } else if (codePoint < MINIMUM_ONE_BYTE_CHARACTER_VALUE + || codePoint > MAXIMUM_ONE_BYTE_CHARACTER_VALUE) { + // three bytes character. + byteCount += 3; + } else { + // one byte character. + byteCount += 1; + } + } + if (writesTerminator) { + // The terminator is one byte. + byteCount += 1; + } + return byteCount; + } + + private: + DISALLOW_IMPLICIT_CONSTRUCTORS(ByteArrayUtils); + + static const uint8_t MINIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t MAXIMUM_ONE_BYTE_CHARACTER_VALUE; + static const uint8_t CHARACTER_ARRAY_TERMINATOR; + + static AK_FORCE_INLINE void writeUint32AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 24) & 0xFF; + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint24AndAdvancePosition(uint8_t *const buffer, + const uint32_t data, int *const pos) { + buffer[(*pos)++] = (data >> 16) & 0xFF; + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint16AndAdvancePosition(uint8_t *const buffer, + const uint16_t data, int *const pos) { + buffer[(*pos)++] = (data >> 8) & 0xFF; + buffer[(*pos)++] = data & 0xFF; + } + + static AK_FORCE_INLINE void writeUint8AndAdvancePosition(uint8_t *const buffer, + const uint8_t data, int *const pos) { + buffer[(*pos)++] = data & 0xFF; + } +}; +} // namespace latinime +#endif /* LATINIME_BYTE_ARRAY_UTILS_H */ |