diff options
17 files changed, 379 insertions, 64 deletions
diff --git a/common/src/com/android/inputmethod/latin/common/LocaleUtils.java b/common/src/com/android/inputmethod/latin/common/LocaleUtils.java index 14b3d220d..7f2333be5 100644 --- a/common/src/com/android/inputmethod/latin/common/LocaleUtils.java +++ b/common/src/com/android/inputmethod/latin/common/LocaleUtils.java @@ -17,8 +17,12 @@ package com.android.inputmethod.latin.common; import java.util.HashMap; +import java.util.HashSet; import java.util.Locale; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; + /** * A class to help with handling Locales in string form. * @@ -160,26 +164,49 @@ public final class LocaleUtils { /** * Creates a locale from a string specification. + * @param localeString a string specification of a locale, in a format of "ll_cc_variant" where + * "ll" is a language code, "cc" is a country code. */ - public static Locale constructLocaleFromString(final String localeStr) { - if (localeStr == null) + @Nullable + public static Locale constructLocaleFromString(@Nullable final String localeString) { + if (localeString == null) { return null; + } synchronized (sLocaleCache) { - if (sLocaleCache.containsKey(localeStr)) - return sLocaleCache.get(localeStr); - Locale retval = null; - String[] localeParams = localeStr.split("_", 3); - if (localeParams.length == 1) { - retval = new Locale(localeParams[0]); - } else if (localeParams.length == 2) { - retval = new Locale(localeParams[0], localeParams[1]); - } else if (localeParams.length == 3) { - retval = new Locale(localeParams[0], localeParams[1], localeParams[2]); + if (sLocaleCache.containsKey(localeString)) { + return sLocaleCache.get(localeString); } - if (retval != null) { - sLocaleCache.put(localeStr, retval); + final String[] elements = localeString.split("_", 3); + final Locale locale; + if (elements.length == 1) { + locale = new Locale(elements[0] /* language */); + } else if (elements.length == 2) { + locale = new Locale(elements[0] /* language */, elements[1] /* country */); + } else { // localeParams.length == 3 + locale = new Locale(elements[0] /* language */, elements[1] /* country */, + elements[2] /* variant */); } - return retval; + sLocaleCache.put(localeString, locale); + return locale; } } + + // TODO: Get this information from the framework instead of maintaining here by ourselves. + private static final HashSet<String> sRtlLanguageCodes = new HashSet<>(); + static { + // List of known Right-To-Left language codes. + sRtlLanguageCodes.add("ar"); // Arabic + sRtlLanguageCodes.add("fa"); // Persian + sRtlLanguageCodes.add("iw"); // Hebrew + sRtlLanguageCodes.add("ku"); // Kurdish + sRtlLanguageCodes.add("ps"); // Pashto + sRtlLanguageCodes.add("sd"); // Sindhi + sRtlLanguageCodes.add("ug"); // Uyghur + sRtlLanguageCodes.add("ur"); // Urdu + sRtlLanguageCodes.add("yi"); // Yiddish + } + + public static boolean isRtlLanguage(@Nonnull final Locale locale) { + return sRtlLanguageCodes.contains(locale.getLanguage()); + } } diff --git a/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java b/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java index 51f37fdc6..b1051385d 100644 --- a/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java +++ b/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java @@ -294,7 +294,7 @@ public final class KeyboardLayoutSet { : subtype; mParams.mSubtype = keyboardSubtype; mParams.mKeyboardLayoutSetName = KEYBOARD_LAYOUT_SET_RESOURCE_PREFIX - + SubtypeLocaleUtils.getKeyboardLayoutSetName(keyboardSubtype); + + keyboardSubtype.getKeyboardLayoutSetName(); return this; } diff --git a/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java b/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java index c739bf3e0..51f89c122 100644 --- a/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java +++ b/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java @@ -36,7 +36,6 @@ import com.android.inputmethod.latin.R; import com.android.inputmethod.latin.common.Constants; import com.android.inputmethod.latin.common.StringUtils; import com.android.inputmethod.latin.utils.ResourceUtils; -import com.android.inputmethod.latin.utils.SubtypeLocaleUtils; import com.android.inputmethod.latin.utils.XmlParseUtils; import com.android.inputmethod.latin.utils.XmlParseUtils.ParseException; @@ -648,7 +647,7 @@ public class KeyboardBuilder<KP extends KeyboardParams> { try { final boolean keyboardLayoutSetMatched = matchString(caseAttr, R.styleable.Keyboard_Case_keyboardLayoutSet, - SubtypeLocaleUtils.getKeyboardLayoutSetName(id.mSubtype)); + id.mSubtype.getKeyboardLayoutSetName()); final boolean keyboardLayoutSetElementMatched = matchTypedValue(caseAttr, R.styleable.Keyboard_Case_keyboardLayoutSetElement, id.mElementId, KeyboardId.elementIdToName(id.mElementId)); diff --git a/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java b/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java index 2a70ef51a..8ed80107a 100644 --- a/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java +++ b/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java @@ -52,7 +52,7 @@ public final class LanguageOnSpacebarHelper { return FORMAT_TYPE_MULTIPLE; } final String keyboardLanguage = locales[0].getLanguage(); - final String keyboardLayout = SubtypeLocaleUtils.getKeyboardLayoutSetName(subtype); + final String keyboardLayout = subtype.getKeyboardLayoutSetName(); int sameLanguageAndLayoutCount = 0; for (final InputMethodSubtype ims : mEnabledSubtypes) { final String language = SubtypeLocaleUtils.getSubtypeLocale(ims).getLanguage(); diff --git a/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java b/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java index 03f6d60ab..ea8d4a210 100644 --- a/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java +++ b/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java @@ -135,13 +135,18 @@ public final class RichInputMethodSubtype { public boolean isRtlSubtype() { // The subtype is considered RTL if the language of the main subtype is RTL. - return SubtypeLocaleUtils.isRtlLanguage(mLocales[0]); + return LocaleUtils.isRtlLanguage(mLocales[0]); } // TODO: remove this method @Nonnull public InputMethodSubtype getRawSubtype() { return mSubtype; } + @Nonnull + public String getKeyboardLayoutSetName() { + return SubtypeLocaleUtils.getKeyboardLayoutSetName(mSubtype); + } + // Dummy no language QWERTY subtype. See {@link R.xml.method}. private static final int SUBTYPE_ID_OF_DUMMY_NO_LANGUAGE_SUBTYPE = 0xdde0bfd3; private static final String EXTRA_VALUE_OF_DUMMY_NO_LANGUAGE_SUBTYPE = diff --git a/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java b/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java index 01398f467..b749aa51a 100644 --- a/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java +++ b/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java @@ -346,8 +346,10 @@ final class CustomInputStylePreference extends DialogPreference super(context, android.R.layout.simple_spinner_item); setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item); + final String[] predefinedKeyboardLayoutSet = context.getResources().getStringArray( + R.array.predefined_layouts); // TODO: Should filter out already existing combinations of locale and layout. - for (final String layout : SubtypeLocaleUtils.getPredefinedKeyboardLayoutSet()) { + for (final String layout : predefinedKeyboardLayoutSet) { // This is a dummy subtype with NO_LANGUAGE, only for display. final InputMethodSubtype subtype = AdditionalSubtypeUtils.createDummyAdditionalSubtype( diff --git a/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java b/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java index 27a0f62ff..7991a2473 100644 --- a/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java +++ b/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java @@ -50,10 +50,10 @@ import com.android.inputmethod.latin.PunctuationSuggestions; import com.android.inputmethod.latin.R; import com.android.inputmethod.latin.SuggestedWords; import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo; +import com.android.inputmethod.latin.common.LocaleUtils; import com.android.inputmethod.latin.settings.Settings; import com.android.inputmethod.latin.settings.SettingsValues; import com.android.inputmethod.latin.utils.ResourceUtils; -import com.android.inputmethod.latin.utils.SubtypeLocaleUtils; import com.android.inputmethod.latin.utils.ViewLayoutUtils; import java.util.ArrayList; @@ -570,8 +570,7 @@ final class SuggestionStripLayoutHelper { final boolean isRtlLanguage = (ViewCompat.getLayoutDirection(addToDictionaryStrip) == ViewCompat.LAYOUT_DIRECTION_RTL); final String arrow = isRtlLanguage ? RIGHTWARDS_ARROW : LEFTWARDS_ARROW; - final boolean isRtlSystem = SubtypeLocaleUtils.isRtlLanguage( - res.getConfiguration().locale); + final boolean isRtlSystem = LocaleUtils.isRtlLanguage(res.getConfiguration().locale); final CharSequence hint = res.getText(R.string.hint_add_to_dictionary); hintText = (isRtlLanguage == isRtlSystem) ? (arrow + hint) : (hint + arrow); hintWidth = width - wordWidth; diff --git a/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java b/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java index b36168b6c..013f024c0 100644 --- a/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java +++ b/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java @@ -27,11 +27,9 @@ import android.util.Log; import android.view.inputmethod.InputMethodSubtype; import com.android.inputmethod.latin.R; -import com.android.inputmethod.latin.RichInputMethodSubtype; import com.android.inputmethod.latin.common.LocaleUtils; import com.android.inputmethod.latin.common.StringUtils; -import java.util.Arrays; import java.util.HashMap; import java.util.Locale; @@ -56,7 +54,6 @@ public final class SubtypeLocaleUtils { private static volatile boolean sInitialized = false; private static final Object sInitializeLock = new Object(); private static Resources sResources; - private static String[] sPredefinedKeyboardLayoutSet; // Keyboard layout to its display name map. private static final HashMap<String, String> sKeyboardLayoutToDisplayNameMap = new HashMap<>(); // Keyboard layout to subtype name resource id map. @@ -103,7 +100,6 @@ public final class SubtypeLocaleUtils { sResources = res; final String[] predefinedLayoutSet = res.getStringArray(R.array.predefined_layouts); - sPredefinedKeyboardLayoutSet = predefinedLayoutSet; final String[] layoutDisplayNames = res.getStringArray( R.array.predefined_layout_display_names); for (int i = 0; i < predefinedLayoutSet.length; i++) { @@ -152,10 +148,6 @@ public final class SubtypeLocaleUtils { } } - public static String[] getPredefinedKeyboardLayoutSet() { - return sPredefinedKeyboardLayoutSet; - } - public static boolean isExceptionalLocale(final String localeString) { return sExceptionalLocaleToNameIdsMap.containsKey(localeString); } @@ -334,10 +326,6 @@ public final class SubtypeLocaleUtils { } @Nonnull - public static String getKeyboardLayoutSetName(@Nonnull final RichInputMethodSubtype subtype) { - return getKeyboardLayoutSetName(subtype.getRawSubtype()); - } - public static String getKeyboardLayoutSetName(final InputMethodSubtype subtype) { String keyboardLayoutSet = subtype.getExtraValueOf(KEYBOARD_LAYOUT_SET); if (keyboardLayoutSet == null) { @@ -357,22 +345,6 @@ public final class SubtypeLocaleUtils { return keyboardLayoutSet; } - // TODO: Get this information from the framework instead of maintaining here by ourselves. - // Sorted list of known Right-To-Left language codes. - private static final String[] SORTED_RTL_LANGUAGES = { - "ar", // Arabic - "fa", // Persian - "iw", // Hebrew - }; - static { - Arrays.sort(SORTED_RTL_LANGUAGES); - } - - public static boolean isRtlLanguage(final Locale locale) { - final String language = locale.getLanguage(); - return Arrays.binarySearch(SORTED_RTL_LANGUAGES, language) >= 0; - } - public static String getCombiningRulesExtraValue(final InputMethodSubtype subtype) { return subtype.getExtraValueOf(COMBINING_RULES); } diff --git a/native/dicttoolkit/NativeFileList.mk b/native/dicttoolkit/NativeFileList.mk index b39a24890..1c004f73a 100644 --- a/native/dicttoolkit/NativeFileList.mk +++ b/native/dicttoolkit/NativeFileList.mk @@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \ makedict_executor.cpp) \ $(addprefix offdevice_intermediate_dict/, \ offdevice_intermediate_dict.cpp) \ - utils/command_utils.cpp + $(addprefix utils/, \ + command_utils.cpp \ + utf8_utils.cpp) LATIN_IME_DICT_TOOLKIT_TEST_FILES := \ dict_toolkit_defines_test.cpp \ $(addprefix offdevice_intermediate_dict/, \ offdevice_intermediate_dict_test.cpp) \ $(addprefix utils/, \ - command_utils_test.cpp) + command_utils_test.cpp \ + utf8_utils_test.cpp) diff --git a/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h index dfdb6a897..13d26ba91 100644 --- a/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h +++ b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h @@ -18,6 +18,7 @@ #define LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_H #include "dict_toolkit_defines.h" +#include "offdevice_intermediate_dict/offdevice_intermediate_dict_header.h" #include "offdevice_intermediate_dict/offdevice_intermediate_dict_pt_node_array.h" #include "suggest/core/dictionary/property/word_property.h" #include "utils/int_array_view.h" @@ -30,13 +31,18 @@ namespace dicttoolkit { */ class OffdeviceIntermediateDict final { public: + OffdeviceIntermediateDict(const OffdeviceIntermediateDictHeader &header) + : mHeader(header), mRootPtNodeArray() {} + bool addWord(const WordProperty &wordProperty); // The returned value will be invalid after modifying the dictionary. e.g. calling addWord(). const WordProperty *getWordProperty(const CodePointArrayView codePoints) const; + const OffdeviceIntermediateDictHeader &getHeader() const { return mHeader; } private: DISALLOW_ASSIGNMENT_OPERATOR(OffdeviceIntermediateDict); + const OffdeviceIntermediateDictHeader mHeader; OffdeviceIntermediateDictPtNodeArray mRootPtNodeArray; bool addWordInner(const CodePointArrayView codePoints, const WordProperty &wordProperty, diff --git a/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h new file mode 100644 index 000000000..440627a79 --- /dev/null +++ b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_HEADER_H +#define LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_HEADER_H + +#include <map> +#include <vector> + +#include "dict_toolkit_defines.h" + +namespace latinime { +namespace dicttoolkit { + +class OffdeviceIntermediateDictHeader final { + public: + using AttributeMap = std::map<std::vector<int>, std::vector<int>>; + + OffdeviceIntermediateDictHeader(const AttributeMap &attributesMap) + : mAttributeMap(attributesMap) {} + + private: + DISALLOW_DEFAULT_CONSTRUCTOR(OffdeviceIntermediateDictHeader); + DISALLOW_ASSIGNMENT_OPERATOR(OffdeviceIntermediateDictHeader); + + const AttributeMap mAttributeMap; +}; + +} // namespace dicttoolkit +} // namespace latinime +#endif // LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_HEADER_H diff --git a/native/dicttoolkit/src/utils/utf8_utils.cpp b/native/dicttoolkit/src/utils/utf8_utils.cpp new file mode 100644 index 000000000..0f349f512 --- /dev/null +++ b/native/dicttoolkit/src/utils/utf8_utils.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/utf8_utils.h" + +#include "utils/char_utils.h" + +namespace latinime { +namespace dicttoolkit { + +const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4; +const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8}; +const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0}; +const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03}; +const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; + +const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F; +const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80; +const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6; + +/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) { + std::vector<int> codePoints; + int remainingByteCountForCurrentCodePoint = 0; + int currentCodePointSequenceSize = 0; + int codePoint = 0; + for (const char c : utf8Str) { + if (remainingByteCountForCurrentCodePoint == 0) { + currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c); + if (currentCodePointSequenceSize <= 0) { + AKLOGE("%x is an invalid utf8 first byte value.", c); + return std::vector<int>(); + } + remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize; + codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint); + } else { + codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; + codePoint += maskTrailingByte(c); + } + remainingByteCountForCurrentCodePoint--; + if (remainingByteCountForCurrentCodePoint == 0) { + if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) { + AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.", + currentCodePointSequenceSize, codePoint); + return std::vector<int>(); + } + codePoints.push_back(codePoint); + } + } + return codePoints; +} + +/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) { + for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { + if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) { + return i; + } + } + // Not a valid utf8 char first byte. + return -1; +} + +/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte, + const int sequenceSize) { + return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize]; +} + +/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) { + return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK; +} + +/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) { + std::string utf8String; + for (const int codePoint : codePoints) { + const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint); + if (sequenceSize <= 0) { + AKLOGE("Cannot encode code point (%d).", codePoint); + return std::string(); + } + const int trailingByteCount = sequenceSize - 1; + // Output first byte. + const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE); + utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize])); + // Output second and later bytes. + for (int i = 1; i < sequenceSize; ++i) { + const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; + const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK; + utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER)); + } + } + return utf8String; +} + +/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) { + if (codePoint < 0) { + return -1; + } + for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) { + if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) { + return i; + } + } + return -1; +} + +} // namespace dicttoolkit +} // namespace latinime diff --git a/native/dicttoolkit/src/utils/utf8_utils.h b/native/dicttoolkit/src/utils/utf8_utils.h new file mode 100644 index 000000000..35818e56c --- /dev/null +++ b/native/dicttoolkit/src/utils/utf8_utils.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H +#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H + +#include <cstdint> +#include <string> +#include <vector> + +#include "dict_toolkit_defines.h" +#include "utils/int_array_view.h" + +namespace latinime { +namespace dicttoolkit { + +class Utf8Utils { +public: + static std::vector<int> getCodePoints(const std::string &utf8Str); + static std::string getUtf8String(const CodePointArrayView codePoints); + +private: + DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils); + + // Values indexed by sequence size. + static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; + static const uint8_t FIRST_BYTE_MARKER_MASKS[]; + static const uint8_t FIRST_BYTE_MARKERS[]; + static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[]; + static const int MAX_ENCODED_CODE_POINT_VALUES[]; + + static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK; + static const uint8_t TRAILING_BYTE_MARKER; + static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE; + + static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte); + static int maskFirstByte(const uint8_t firstByte, const int encodeSize); + static int maskTrailingByte(const uint8_t secondOrLaterByte); + static int getSequenceSizeToEncodeCodePoint(const int codePoint); +}; +} // namespace dicttoolkit +} // namespace latinime +#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H diff --git a/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp b/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp index 3bcb89e46..f2e24ab5f 100644 --- a/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp +++ b/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp @@ -41,7 +41,8 @@ const WordProperty getDummpWordProperty(const std::vector<int> &&codePoints) { } TEST(OffdeviceIntermediateDictTest, TestAddWordProperties) { - OffdeviceIntermediateDict dict; + OffdeviceIntermediateDict dict = OffdeviceIntermediateDict( + OffdeviceIntermediateDictHeader(OffdeviceIntermediateDictHeader::AttributeMap())); EXPECT_EQ(nullptr, dict.getWordProperty(CodePointArrayView())); const WordProperty wordProperty0 = getDummpWordProperty(getCodePointVector("abcd")); diff --git a/native/dicttoolkit/tests/utils/utf8_utils_test.cpp b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp new file mode 100644 index 000000000..9c59a8b05 --- /dev/null +++ b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utils/utf8_utils.h" + +#include <gtest/gtest.h> + +#include <vector> + +#include "utils/int_array_view.h" + +namespace latinime { +namespace dicttoolkit { +namespace { + +TEST(Utf8UtilsTests, TestGetCodePoints) { + { + const std::vector<int> codePoints = Utf8Utils::getCodePoints(""); + EXPECT_EQ(0u, codePoints.size()); + } + { + const std::vector<int> codePoints = Utf8Utils::getCodePoints("test"); + EXPECT_EQ(4u, codePoints.size()); + EXPECT_EQ('t', codePoints[0]); + EXPECT_EQ('e', codePoints[1]); + EXPECT_EQ('s', codePoints[2]); + EXPECT_EQ('t', codePoints[3]); + } + { + const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410"); + EXPECT_EQ(4u, codePoints.size()); + EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A + EXPECT_EQ('a', codePoints[1]); + EXPECT_EQ(0x03C2, codePoints[2]); // CYRILLIC CAPITAL LETTER A + EXPECT_EQ(0x0410, codePoints[3]); // GREEK SMALL LETTER FINAL SIGMA + } + { + const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752"); + EXPECT_EQ(3u, codePoints.size()); + EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE + EXPECT_EQ('?', codePoints[1]); + EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT + } + + // Redundant UTF-8 sequences must be rejected. + EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty()); + EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty()); + EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty()); +} + +TEST(Utf8UtilsTests, TestGetUtf8String) { + { + const std::vector<int> codePoints = {'t', 'e', 's', 't'}; + EXPECT_EQ("test", Utf8Utils::getUtf8String(CodePointArrayView(codePoints))); + } + { + const std::vector<int> codePoints = { + 0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */, + 0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */, + 0x0430 /* CYRILLIC SMALL LETTER A */, + 0x3042 /* HIRAGANA LETTER A */, + 0x1F36A /* COOKIE */, + 0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */ + }; + EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752", + Utf8Utils::getUtf8String(CodePointArrayView(codePoints))); + } +} + +} // namespace +} // namespace dicttoolkit +} // namespace latinime diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java index 6c60fdc0c..d833b9736 100644 --- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java +++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java @@ -26,7 +26,6 @@ import com.android.inputmethod.latin.BinaryDictionary; import com.android.inputmethod.latin.common.CodePointUtils; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding; import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer; -import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode; import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray; diff --git a/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java b/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java index 54f478f5a..03dcdfc78 100644 --- a/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java +++ b/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java @@ -434,8 +434,8 @@ public class SubtypeLocaleUtilsTests extends AndroidTestCase { // locale layout | display name // ------ -------------- - ---------------------- // sr south_slavic F Српски - // sr_ZZ serbian_qwertz F српски (латиница) - // sr_ZZ qwerty T српски (QWERTY) + // sr_ZZ serbian_qwertz F Српски (латиница) + // sr_ZZ qwerty T Српски (QWERTY) public void testSerbianLatinSubtypesInSerbianSystemLocale() { final RunInLocale<Void> tests = new RunInLocale<Void>() { @@ -445,12 +445,10 @@ public class SubtypeLocaleUtilsTests extends AndroidTestCase { SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR)); // These are preliminary subtypes and may not exist. if (SR_LATN != null) { - // TODO: Uncommented because of the current translation of these strings - // in Seriban are described in Latin script. -// assertEquals("sr_ZZ", "српски (латиница)", -// SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN)); -// assertEquals("sr_ZZ", "српски (QWERTY)", -// SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN_QWERTY)); + assertEquals("sr_ZZ", "Српски (латиница)", + SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN)); + assertEquals("sr_ZZ", "Српски (QWERTY)", + SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN_QWERTY)); } return null; } |