aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--common/src/com/android/inputmethod/latin/common/LocaleUtils.java57
-rw-r--r--java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java2
-rw-r--r--java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java3
-rw-r--r--java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java2
-rw-r--r--java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java7
-rw-r--r--java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java4
-rw-r--r--java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java5
-rw-r--r--java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java28
-rw-r--r--native/dicttoolkit/NativeFileList.mk7
-rw-r--r--native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h6
-rw-r--r--native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h44
-rw-r--r--native/dicttoolkit/src/utils/utf8_utils.cpp119
-rw-r--r--native/dicttoolkit/src/utils/utf8_utils.h56
-rw-r--r--native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp3
-rw-r--r--native/dicttoolkit/tests/utils/utf8_utils_test.cpp85
-rw-r--r--tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java1
-rw-r--r--tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java14
17 files changed, 379 insertions, 64 deletions
diff --git a/common/src/com/android/inputmethod/latin/common/LocaleUtils.java b/common/src/com/android/inputmethod/latin/common/LocaleUtils.java
index 14b3d220d..7f2333be5 100644
--- a/common/src/com/android/inputmethod/latin/common/LocaleUtils.java
+++ b/common/src/com/android/inputmethod/latin/common/LocaleUtils.java
@@ -17,8 +17,12 @@
package com.android.inputmethod.latin.common;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Locale;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
/**
* A class to help with handling Locales in string form.
*
@@ -160,26 +164,49 @@ public final class LocaleUtils {
/**
* Creates a locale from a string specification.
+ * @param localeString a string specification of a locale, in a format of "ll_cc_variant" where
+ * "ll" is a language code, "cc" is a country code.
*/
- public static Locale constructLocaleFromString(final String localeStr) {
- if (localeStr == null)
+ @Nullable
+ public static Locale constructLocaleFromString(@Nullable final String localeString) {
+ if (localeString == null) {
return null;
+ }
synchronized (sLocaleCache) {
- if (sLocaleCache.containsKey(localeStr))
- return sLocaleCache.get(localeStr);
- Locale retval = null;
- String[] localeParams = localeStr.split("_", 3);
- if (localeParams.length == 1) {
- retval = new Locale(localeParams[0]);
- } else if (localeParams.length == 2) {
- retval = new Locale(localeParams[0], localeParams[1]);
- } else if (localeParams.length == 3) {
- retval = new Locale(localeParams[0], localeParams[1], localeParams[2]);
+ if (sLocaleCache.containsKey(localeString)) {
+ return sLocaleCache.get(localeString);
}
- if (retval != null) {
- sLocaleCache.put(localeStr, retval);
+ final String[] elements = localeString.split("_", 3);
+ final Locale locale;
+ if (elements.length == 1) {
+ locale = new Locale(elements[0] /* language */);
+ } else if (elements.length == 2) {
+ locale = new Locale(elements[0] /* language */, elements[1] /* country */);
+ } else { // localeParams.length == 3
+ locale = new Locale(elements[0] /* language */, elements[1] /* country */,
+ elements[2] /* variant */);
}
- return retval;
+ sLocaleCache.put(localeString, locale);
+ return locale;
}
}
+
+ // TODO: Get this information from the framework instead of maintaining here by ourselves.
+ private static final HashSet<String> sRtlLanguageCodes = new HashSet<>();
+ static {
+ // List of known Right-To-Left language codes.
+ sRtlLanguageCodes.add("ar"); // Arabic
+ sRtlLanguageCodes.add("fa"); // Persian
+ sRtlLanguageCodes.add("iw"); // Hebrew
+ sRtlLanguageCodes.add("ku"); // Kurdish
+ sRtlLanguageCodes.add("ps"); // Pashto
+ sRtlLanguageCodes.add("sd"); // Sindhi
+ sRtlLanguageCodes.add("ug"); // Uyghur
+ sRtlLanguageCodes.add("ur"); // Urdu
+ sRtlLanguageCodes.add("yi"); // Yiddish
+ }
+
+ public static boolean isRtlLanguage(@Nonnull final Locale locale) {
+ return sRtlLanguageCodes.contains(locale.getLanguage());
+ }
}
diff --git a/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java b/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java
index 51f37fdc6..b1051385d 100644
--- a/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java
+++ b/java/src/com/android/inputmethod/keyboard/KeyboardLayoutSet.java
@@ -294,7 +294,7 @@ public final class KeyboardLayoutSet {
: subtype;
mParams.mSubtype = keyboardSubtype;
mParams.mKeyboardLayoutSetName = KEYBOARD_LAYOUT_SET_RESOURCE_PREFIX
- + SubtypeLocaleUtils.getKeyboardLayoutSetName(keyboardSubtype);
+ + keyboardSubtype.getKeyboardLayoutSetName();
return this;
}
diff --git a/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java b/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java
index c739bf3e0..51f89c122 100644
--- a/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java
+++ b/java/src/com/android/inputmethod/keyboard/internal/KeyboardBuilder.java
@@ -36,7 +36,6 @@ import com.android.inputmethod.latin.R;
import com.android.inputmethod.latin.common.Constants;
import com.android.inputmethod.latin.common.StringUtils;
import com.android.inputmethod.latin.utils.ResourceUtils;
-import com.android.inputmethod.latin.utils.SubtypeLocaleUtils;
import com.android.inputmethod.latin.utils.XmlParseUtils;
import com.android.inputmethod.latin.utils.XmlParseUtils.ParseException;
@@ -648,7 +647,7 @@ public class KeyboardBuilder<KP extends KeyboardParams> {
try {
final boolean keyboardLayoutSetMatched = matchString(caseAttr,
R.styleable.Keyboard_Case_keyboardLayoutSet,
- SubtypeLocaleUtils.getKeyboardLayoutSetName(id.mSubtype));
+ id.mSubtype.getKeyboardLayoutSetName());
final boolean keyboardLayoutSetElementMatched = matchTypedValue(caseAttr,
R.styleable.Keyboard_Case_keyboardLayoutSetElement, id.mElementId,
KeyboardId.elementIdToName(id.mElementId));
diff --git a/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java b/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java
index 2a70ef51a..8ed80107a 100644
--- a/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java
+++ b/java/src/com/android/inputmethod/keyboard/internal/LanguageOnSpacebarHelper.java
@@ -52,7 +52,7 @@ public final class LanguageOnSpacebarHelper {
return FORMAT_TYPE_MULTIPLE;
}
final String keyboardLanguage = locales[0].getLanguage();
- final String keyboardLayout = SubtypeLocaleUtils.getKeyboardLayoutSetName(subtype);
+ final String keyboardLayout = subtype.getKeyboardLayoutSetName();
int sameLanguageAndLayoutCount = 0;
for (final InputMethodSubtype ims : mEnabledSubtypes) {
final String language = SubtypeLocaleUtils.getSubtypeLocale(ims).getLanguage();
diff --git a/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java b/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java
index 03f6d60ab..ea8d4a210 100644
--- a/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java
+++ b/java/src/com/android/inputmethod/latin/RichInputMethodSubtype.java
@@ -135,13 +135,18 @@ public final class RichInputMethodSubtype {
public boolean isRtlSubtype() {
// The subtype is considered RTL if the language of the main subtype is RTL.
- return SubtypeLocaleUtils.isRtlLanguage(mLocales[0]);
+ return LocaleUtils.isRtlLanguage(mLocales[0]);
}
// TODO: remove this method
@Nonnull
public InputMethodSubtype getRawSubtype() { return mSubtype; }
+ @Nonnull
+ public String getKeyboardLayoutSetName() {
+ return SubtypeLocaleUtils.getKeyboardLayoutSetName(mSubtype);
+ }
+
// Dummy no language QWERTY subtype. See {@link R.xml.method}.
private static final int SUBTYPE_ID_OF_DUMMY_NO_LANGUAGE_SUBTYPE = 0xdde0bfd3;
private static final String EXTRA_VALUE_OF_DUMMY_NO_LANGUAGE_SUBTYPE =
diff --git a/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java b/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java
index 01398f467..b749aa51a 100644
--- a/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java
+++ b/java/src/com/android/inputmethod/latin/settings/CustomInputStylePreference.java
@@ -346,8 +346,10 @@ final class CustomInputStylePreference extends DialogPreference
super(context, android.R.layout.simple_spinner_item);
setDropDownViewResource(android.R.layout.simple_spinner_dropdown_item);
+ final String[] predefinedKeyboardLayoutSet = context.getResources().getStringArray(
+ R.array.predefined_layouts);
// TODO: Should filter out already existing combinations of locale and layout.
- for (final String layout : SubtypeLocaleUtils.getPredefinedKeyboardLayoutSet()) {
+ for (final String layout : predefinedKeyboardLayoutSet) {
// This is a dummy subtype with NO_LANGUAGE, only for display.
final InputMethodSubtype subtype =
AdditionalSubtypeUtils.createDummyAdditionalSubtype(
diff --git a/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java b/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java
index 27a0f62ff..7991a2473 100644
--- a/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java
+++ b/java/src/com/android/inputmethod/latin/suggestions/SuggestionStripLayoutHelper.java
@@ -50,10 +50,10 @@ import com.android.inputmethod.latin.PunctuationSuggestions;
import com.android.inputmethod.latin.R;
import com.android.inputmethod.latin.SuggestedWords;
import com.android.inputmethod.latin.SuggestedWords.SuggestedWordInfo;
+import com.android.inputmethod.latin.common.LocaleUtils;
import com.android.inputmethod.latin.settings.Settings;
import com.android.inputmethod.latin.settings.SettingsValues;
import com.android.inputmethod.latin.utils.ResourceUtils;
-import com.android.inputmethod.latin.utils.SubtypeLocaleUtils;
import com.android.inputmethod.latin.utils.ViewLayoutUtils;
import java.util.ArrayList;
@@ -570,8 +570,7 @@ final class SuggestionStripLayoutHelper {
final boolean isRtlLanguage = (ViewCompat.getLayoutDirection(addToDictionaryStrip)
== ViewCompat.LAYOUT_DIRECTION_RTL);
final String arrow = isRtlLanguage ? RIGHTWARDS_ARROW : LEFTWARDS_ARROW;
- final boolean isRtlSystem = SubtypeLocaleUtils.isRtlLanguage(
- res.getConfiguration().locale);
+ final boolean isRtlSystem = LocaleUtils.isRtlLanguage(res.getConfiguration().locale);
final CharSequence hint = res.getText(R.string.hint_add_to_dictionary);
hintText = (isRtlLanguage == isRtlSystem) ? (arrow + hint) : (hint + arrow);
hintWidth = width - wordWidth;
diff --git a/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java b/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java
index b36168b6c..013f024c0 100644
--- a/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java
+++ b/java/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtils.java
@@ -27,11 +27,9 @@ import android.util.Log;
import android.view.inputmethod.InputMethodSubtype;
import com.android.inputmethod.latin.R;
-import com.android.inputmethod.latin.RichInputMethodSubtype;
import com.android.inputmethod.latin.common.LocaleUtils;
import com.android.inputmethod.latin.common.StringUtils;
-import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;
@@ -56,7 +54,6 @@ public final class SubtypeLocaleUtils {
private static volatile boolean sInitialized = false;
private static final Object sInitializeLock = new Object();
private static Resources sResources;
- private static String[] sPredefinedKeyboardLayoutSet;
// Keyboard layout to its display name map.
private static final HashMap<String, String> sKeyboardLayoutToDisplayNameMap = new HashMap<>();
// Keyboard layout to subtype name resource id map.
@@ -103,7 +100,6 @@ public final class SubtypeLocaleUtils {
sResources = res;
final String[] predefinedLayoutSet = res.getStringArray(R.array.predefined_layouts);
- sPredefinedKeyboardLayoutSet = predefinedLayoutSet;
final String[] layoutDisplayNames = res.getStringArray(
R.array.predefined_layout_display_names);
for (int i = 0; i < predefinedLayoutSet.length; i++) {
@@ -152,10 +148,6 @@ public final class SubtypeLocaleUtils {
}
}
- public static String[] getPredefinedKeyboardLayoutSet() {
- return sPredefinedKeyboardLayoutSet;
- }
-
public static boolean isExceptionalLocale(final String localeString) {
return sExceptionalLocaleToNameIdsMap.containsKey(localeString);
}
@@ -334,10 +326,6 @@ public final class SubtypeLocaleUtils {
}
@Nonnull
- public static String getKeyboardLayoutSetName(@Nonnull final RichInputMethodSubtype subtype) {
- return getKeyboardLayoutSetName(subtype.getRawSubtype());
- }
-
public static String getKeyboardLayoutSetName(final InputMethodSubtype subtype) {
String keyboardLayoutSet = subtype.getExtraValueOf(KEYBOARD_LAYOUT_SET);
if (keyboardLayoutSet == null) {
@@ -357,22 +345,6 @@ public final class SubtypeLocaleUtils {
return keyboardLayoutSet;
}
- // TODO: Get this information from the framework instead of maintaining here by ourselves.
- // Sorted list of known Right-To-Left language codes.
- private static final String[] SORTED_RTL_LANGUAGES = {
- "ar", // Arabic
- "fa", // Persian
- "iw", // Hebrew
- };
- static {
- Arrays.sort(SORTED_RTL_LANGUAGES);
- }
-
- public static boolean isRtlLanguage(final Locale locale) {
- final String language = locale.getLanguage();
- return Arrays.binarySearch(SORTED_RTL_LANGUAGES, language) >= 0;
- }
-
public static String getCombiningRulesExtraValue(final InputMethodSubtype subtype) {
return subtype.getExtraValueOf(COMBINING_RULES);
}
diff --git a/native/dicttoolkit/NativeFileList.mk b/native/dicttoolkit/NativeFileList.mk
index b39a24890..1c004f73a 100644
--- a/native/dicttoolkit/NativeFileList.mk
+++ b/native/dicttoolkit/NativeFileList.mk
@@ -24,11 +24,14 @@ LATIN_IME_DICT_TOOLKIT_SRC_FILES := \
makedict_executor.cpp) \
$(addprefix offdevice_intermediate_dict/, \
offdevice_intermediate_dict.cpp) \
- utils/command_utils.cpp
+ $(addprefix utils/, \
+ command_utils.cpp \
+ utf8_utils.cpp)
LATIN_IME_DICT_TOOLKIT_TEST_FILES := \
dict_toolkit_defines_test.cpp \
$(addprefix offdevice_intermediate_dict/, \
offdevice_intermediate_dict_test.cpp) \
$(addprefix utils/, \
- command_utils_test.cpp)
+ command_utils_test.cpp \
+ utf8_utils_test.cpp)
diff --git a/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h
index dfdb6a897..13d26ba91 100644
--- a/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h
+++ b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict.h
@@ -18,6 +18,7 @@
#define LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_H
#include "dict_toolkit_defines.h"
+#include "offdevice_intermediate_dict/offdevice_intermediate_dict_header.h"
#include "offdevice_intermediate_dict/offdevice_intermediate_dict_pt_node_array.h"
#include "suggest/core/dictionary/property/word_property.h"
#include "utils/int_array_view.h"
@@ -30,13 +31,18 @@ namespace dicttoolkit {
*/
class OffdeviceIntermediateDict final {
public:
+ OffdeviceIntermediateDict(const OffdeviceIntermediateDictHeader &header)
+ : mHeader(header), mRootPtNodeArray() {}
+
bool addWord(const WordProperty &wordProperty);
// The returned value will be invalid after modifying the dictionary. e.g. calling addWord().
const WordProperty *getWordProperty(const CodePointArrayView codePoints) const;
+ const OffdeviceIntermediateDictHeader &getHeader() const { return mHeader; }
private:
DISALLOW_ASSIGNMENT_OPERATOR(OffdeviceIntermediateDict);
+ const OffdeviceIntermediateDictHeader mHeader;
OffdeviceIntermediateDictPtNodeArray mRootPtNodeArray;
bool addWordInner(const CodePointArrayView codePoints, const WordProperty &wordProperty,
diff --git a/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h
new file mode 100644
index 000000000..440627a79
--- /dev/null
+++ b/native/dicttoolkit/src/offdevice_intermediate_dict/offdevice_intermediate_dict_header.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_HEADER_H
+#define LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_HEADER_H
+
+#include <map>
+#include <vector>
+
+#include "dict_toolkit_defines.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+class OffdeviceIntermediateDictHeader final {
+ public:
+ using AttributeMap = std::map<std::vector<int>, std::vector<int>>;
+
+ OffdeviceIntermediateDictHeader(const AttributeMap &attributesMap)
+ : mAttributeMap(attributesMap) {}
+
+ private:
+ DISALLOW_DEFAULT_CONSTRUCTOR(OffdeviceIntermediateDictHeader);
+ DISALLOW_ASSIGNMENT_OPERATOR(OffdeviceIntermediateDictHeader);
+
+ const AttributeMap mAttributeMap;
+};
+
+} // namespace dicttoolkit
+} // namespace latinime
+#endif // LATINIME_DICT_TOOLKIT_OFFDEVICE_INTERMEDIATE_DICT_HEADER_H
diff --git a/native/dicttoolkit/src/utils/utf8_utils.cpp b/native/dicttoolkit/src/utils/utf8_utils.cpp
new file mode 100644
index 000000000..0f349f512
--- /dev/null
+++ b/native/dicttoolkit/src/utils/utf8_utils.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8_utils.h"
+
+#include "utils/char_utils.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+const size_t Utf8Utils::MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT = 4;
+const uint8_t Utf8Utils::FIRST_BYTE_MARKER_MASKS[] = {0, 0x80, 0xE0, 0xF0, 0xF8};
+const uint8_t Utf8Utils::FIRST_BYTE_MARKERS[] = {0, 0x00, 0xC0, 0xE0, 0xF0};
+const uint8_t Utf8Utils::FIRST_BYTE_CODE_POINT_BITS_MASKS[] = {0, 0x7F, 0x1F, 0x0F, 0x03};
+const int Utf8Utils::MAX_ENCODED_CODE_POINT_VALUES[] = {-1, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
+
+const uint8_t Utf8Utils::TRAILING_BYTE_CODE_POINT_BITS_MASK = 0x3F;
+const uint8_t Utf8Utils::TRAILING_BYTE_MARKER = 0x80;
+const size_t Utf8Utils::CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE = 6;
+
+/* static */ std::vector<int> Utf8Utils::getCodePoints(const std::string &utf8Str) {
+ std::vector<int> codePoints;
+ int remainingByteCountForCurrentCodePoint = 0;
+ int currentCodePointSequenceSize = 0;
+ int codePoint = 0;
+ for (const char c : utf8Str) {
+ if (remainingByteCountForCurrentCodePoint == 0) {
+ currentCodePointSequenceSize = getSequenceSizeByCheckingFirstByte(c);
+ if (currentCodePointSequenceSize <= 0) {
+ AKLOGE("%x is an invalid utf8 first byte value.", c);
+ return std::vector<int>();
+ }
+ remainingByteCountForCurrentCodePoint = currentCodePointSequenceSize;
+ codePoint = maskFirstByte(c, remainingByteCountForCurrentCodePoint);
+ } else {
+ codePoint <<= CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+ codePoint += maskTrailingByte(c);
+ }
+ remainingByteCountForCurrentCodePoint--;
+ if (remainingByteCountForCurrentCodePoint == 0) {
+ if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[currentCodePointSequenceSize - 1]) {
+ AKLOGE("%d bytes encode for codePoint(%x) is a redundant UTF-8 sequence.",
+ currentCodePointSequenceSize, codePoint);
+ return std::vector<int>();
+ }
+ codePoints.push_back(codePoint);
+ }
+ }
+ return codePoints;
+}
+
+/* static */ int Utf8Utils::getSequenceSizeByCheckingFirstByte(const uint8_t firstByte) {
+ for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
+ if ((firstByte & FIRST_BYTE_MARKER_MASKS[i]) == FIRST_BYTE_MARKERS[i]) {
+ return i;
+ }
+ }
+ // Not a valid utf8 char first byte.
+ return -1;
+}
+
+/* static */ AK_FORCE_INLINE int Utf8Utils::maskFirstByte(const uint8_t firstByte,
+ const int sequenceSize) {
+ return firstByte & FIRST_BYTE_CODE_POINT_BITS_MASKS[sequenceSize];
+}
+
+/* static */ AK_FORCE_INLINE int Utf8Utils::maskTrailingByte(const uint8_t secondOrLaterByte) {
+ return secondOrLaterByte & TRAILING_BYTE_CODE_POINT_BITS_MASK;
+}
+
+/* static */ std::string Utf8Utils::getUtf8String(const CodePointArrayView codePoints) {
+ std::string utf8String;
+ for (const int codePoint : codePoints) {
+ const int sequenceSize = getSequenceSizeToEncodeCodePoint(codePoint);
+ if (sequenceSize <= 0) {
+ AKLOGE("Cannot encode code point (%d).", codePoint);
+ return std::string();
+ }
+ const int trailingByteCount = sequenceSize - 1;
+ // Output first byte.
+ const int value = codePoint >> (trailingByteCount * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE);
+ utf8String.push_back(static_cast<char>(value | FIRST_BYTE_MARKERS[sequenceSize]));
+ // Output second and later bytes.
+ for (int i = 1; i < sequenceSize; ++i) {
+ const int shiftAmount = (trailingByteCount - i) * CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+ const int value = (codePoint >> shiftAmount) & TRAILING_BYTE_CODE_POINT_BITS_MASK;
+ utf8String.push_back(static_cast<char>(value | TRAILING_BYTE_MARKER));
+ }
+ }
+ return utf8String;
+}
+
+/* static */ int Utf8Utils::getSequenceSizeToEncodeCodePoint(const int codePoint) {
+ if (codePoint < 0) {
+ return -1;
+ }
+ for (size_t i = 1; i <= MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT; ++i) {
+ if (codePoint <= MAX_ENCODED_CODE_POINT_VALUES[i]) {
+ return i;
+ }
+ }
+ return -1;
+}
+
+} // namespace dicttoolkit
+} // namespace latinime
diff --git a/native/dicttoolkit/src/utils/utf8_utils.h b/native/dicttoolkit/src/utils/utf8_utils.h
new file mode 100644
index 000000000..35818e56c
--- /dev/null
+++ b/native/dicttoolkit/src/utils/utf8_utils.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
+#define LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "dict_toolkit_defines.h"
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace dicttoolkit {
+
+class Utf8Utils {
+public:
+ static std::vector<int> getCodePoints(const std::string &utf8Str);
+ static std::string getUtf8String(const CodePointArrayView codePoints);
+
+private:
+ DISALLOW_IMPLICIT_CONSTRUCTORS(Utf8Utils);
+
+ // Values indexed by sequence size.
+ static const size_t MAX_SEQUENCE_SIZE_FOR_A_CODE_POINT;
+ static const uint8_t FIRST_BYTE_MARKER_MASKS[];
+ static const uint8_t FIRST_BYTE_MARKERS[];
+ static const uint8_t FIRST_BYTE_CODE_POINT_BITS_MASKS[];
+ static const int MAX_ENCODED_CODE_POINT_VALUES[];
+
+ static const uint8_t TRAILING_BYTE_CODE_POINT_BITS_MASK;
+ static const uint8_t TRAILING_BYTE_MARKER;
+ static const size_t CODE_POINT_BIT_COUNT_IN_TRAILING_BYTE;
+
+ static int getSequenceSizeByCheckingFirstByte(const uint8_t firstByte);
+ static int maskFirstByte(const uint8_t firstByte, const int encodeSize);
+ static int maskTrailingByte(const uint8_t secondOrLaterByte);
+ static int getSequenceSizeToEncodeCodePoint(const int codePoint);
+};
+} // namespace dicttoolkit
+} // namespace latinime
+#endif // LATINIME_DICT_TOOLKIT_UTF8_UTILS_H
diff --git a/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp b/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp
index 3bcb89e46..f2e24ab5f 100644
--- a/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp
+++ b/native/dicttoolkit/tests/offdevice_intermediate_dict/offdevice_intermediate_dict_test.cpp
@@ -41,7 +41,8 @@ const WordProperty getDummpWordProperty(const std::vector<int> &&codePoints) {
}
TEST(OffdeviceIntermediateDictTest, TestAddWordProperties) {
- OffdeviceIntermediateDict dict;
+ OffdeviceIntermediateDict dict = OffdeviceIntermediateDict(
+ OffdeviceIntermediateDictHeader(OffdeviceIntermediateDictHeader::AttributeMap()));
EXPECT_EQ(nullptr, dict.getWordProperty(CodePointArrayView()));
const WordProperty wordProperty0 = getDummpWordProperty(getCodePointVector("abcd"));
diff --git a/native/dicttoolkit/tests/utils/utf8_utils_test.cpp b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp
new file mode 100644
index 000000000..9c59a8b05
--- /dev/null
+++ b/native/dicttoolkit/tests/utils/utf8_utils_test.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utils/utf8_utils.h"
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "utils/int_array_view.h"
+
+namespace latinime {
+namespace dicttoolkit {
+namespace {
+
+TEST(Utf8UtilsTests, TestGetCodePoints) {
+ {
+ const std::vector<int> codePoints = Utf8Utils::getCodePoints("");
+ EXPECT_EQ(0u, codePoints.size());
+ }
+ {
+ const std::vector<int> codePoints = Utf8Utils::getCodePoints("test");
+ EXPECT_EQ(4u, codePoints.size());
+ EXPECT_EQ('t', codePoints[0]);
+ EXPECT_EQ('e', codePoints[1]);
+ EXPECT_EQ('s', codePoints[2]);
+ EXPECT_EQ('t', codePoints[3]);
+ }
+ {
+ const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\u3042a\u03C2\u0410");
+ EXPECT_EQ(4u, codePoints.size());
+ EXPECT_EQ(0x3042, codePoints[0]); // HIRAGANA LETTER A
+ EXPECT_EQ('a', codePoints[1]);
+ EXPECT_EQ(0x03C2, codePoints[2]); // CYRILLIC CAPITAL LETTER A
+ EXPECT_EQ(0x0410, codePoints[3]); // GREEK SMALL LETTER FINAL SIGMA
+ }
+ {
+ const std::vector<int> codePoints = Utf8Utils::getCodePoints(u8"\U0001F36A?\U0001F752");
+ EXPECT_EQ(3u, codePoints.size());
+ EXPECT_EQ(0x1F36A, codePoints[0]); // COOKIE
+ EXPECT_EQ('?', codePoints[1]);
+ EXPECT_EQ(0x1F752, codePoints[2]); // ALCHEMICAL SYMBOL FOR STARRED TRIDENT
+ }
+
+ // Redundant UTF-8 sequences must be rejected.
+ EXPECT_TRUE(Utf8Utils::getCodePoints("\xC0\xAF").empty());
+ EXPECT_TRUE(Utf8Utils::getCodePoints("\xE0\x80\xAF").empty());
+ EXPECT_TRUE(Utf8Utils::getCodePoints("\xF0\x80\x80\xAF").empty());
+}
+
+TEST(Utf8UtilsTests, TestGetUtf8String) {
+ {
+ const std::vector<int> codePoints = {'t', 'e', 's', 't'};
+ EXPECT_EQ("test", Utf8Utils::getUtf8String(CodePointArrayView(codePoints)));
+ }
+ {
+ const std::vector<int> codePoints = {
+ 0x00E0 /* LATIN SMALL LETTER A WITH GRAVE */,
+ 0x03C2 /* GREEK SMALL LETTER FINAL SIGMA */,
+ 0x0430 /* CYRILLIC SMALL LETTER A */,
+ 0x3042 /* HIRAGANA LETTER A */,
+ 0x1F36A /* COOKIE */,
+ 0x1F752 /* ALCHEMICAL SYMBOL FOR STARRED TRIDENT */
+ };
+ EXPECT_EQ(u8"\u00E0\u03C2\u0430\u3042\U0001F36A\U0001F752",
+ Utf8Utils::getUtf8String(CodePointArrayView(codePoints)));
+ }
+}
+
+} // namespace
+} // namespace dicttoolkit
+} // namespace latinime
diff --git a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
index 6c60fdc0c..d833b9736 100644
--- a/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
+++ b/tests/src/com/android/inputmethod/latin/makedict/BinaryDictDecoderEncoderTests.java
@@ -26,7 +26,6 @@ import com.android.inputmethod.latin.BinaryDictionary;
import com.android.inputmethod.latin.common.CodePointUtils;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.DictBuffer;
-import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
diff --git a/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java b/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java
index 54f478f5a..03dcdfc78 100644
--- a/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java
+++ b/tests/src/com/android/inputmethod/latin/utils/SubtypeLocaleUtilsTests.java
@@ -434,8 +434,8 @@ public class SubtypeLocaleUtilsTests extends AndroidTestCase {
// locale layout | display name
// ------ -------------- - ----------------------
// sr south_slavic F Српски
- // sr_ZZ serbian_qwertz F српски (латиница)
- // sr_ZZ qwerty T српски (QWERTY)
+ // sr_ZZ serbian_qwertz F Српски (латиница)
+ // sr_ZZ qwerty T Српски (QWERTY)
public void testSerbianLatinSubtypesInSerbianSystemLocale() {
final RunInLocale<Void> tests = new RunInLocale<Void>() {
@@ -445,12 +445,10 @@ public class SubtypeLocaleUtilsTests extends AndroidTestCase {
SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR));
// These are preliminary subtypes and may not exist.
if (SR_LATN != null) {
- // TODO: Uncommented because of the current translation of these strings
- // in Seriban are described in Latin script.
-// assertEquals("sr_ZZ", "српски (латиница)",
-// SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN));
-// assertEquals("sr_ZZ", "српски (QWERTY)",
-// SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN_QWERTY));
+ assertEquals("sr_ZZ", "Српски (латиница)",
+ SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN));
+ assertEquals("sr_ZZ", "Српски (QWERTY)",
+ SubtypeLocaleUtils.getSubtypeDisplayNameInSystemLocale(SR_LATN_QWERTY));
}
return null;
}