aboutsummaryrefslogtreecommitdiffstats
path: root/native/jni/src/utils/char_utils.h
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src/utils/char_utils.h')
-rw-r--r--native/jni/src/utils/char_utils.h27
1 files changed, 27 insertions, 0 deletions
diff --git a/native/jni/src/utils/char_utils.h b/native/jni/src/utils/char_utils.h
index 41663c81a..63786502b 100644
--- a/native/jni/src/utils/char_utils.h
+++ b/native/jni/src/utils/char_utils.h
@@ -18,6 +18,8 @@
#define LATINIME_CHAR_UTILS_H
#include <cctype>
+#include <cstring>
+#include <vector>
#include "defines.h"
@@ -85,11 +87,36 @@ class CharUtils {
return spaceCount;
}
+ // Tells whether codePoint lies in the closed range
+ // [MIN_UNICODE_CODE_POINT, MAX_UNICODE_CODE_POINT]. Yields 1 when it does, 0 otherwise.
+ static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) {
+     if (codePoint < MIN_UNICODE_CODE_POINT) {
+         // Below the lower bound.
+         return 0;
+     }
+     return codePoint <= MAX_UNICODE_CODE_POINT;
+ }
+
static unsigned short latin_tolower(const unsigned short c);
+ static const std::vector<int> EMPTY_STRING;
+
+ // Prepends CODE_POINT_BEGINNING_OF_SENTENCE to the code points stored in codePoints, shifting
+ // the existing contents up by one slot. The buffer is modified in place.
+ //
+ // codePoints:     buffer holding the code points; assumed to have room for at least
+ //                 maxCodePoint ints (NOTE(review): confirm against callers).
+ // codePointCount: number of code points currently stored in the buffer.
+ // maxCodePoint:   upper limit on the number of code points the result may contain.
+ //
+ // Returns the updated code point count, or the unchanged count when the marker is already
+ // attached. Returns 0 when the code points cannot be marked as a Beginning-of-Sentence:
+ // either there is no room left for the marker or codePointCount is negative.
+ static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
+         const int codePointCount, const int maxCodePoint) {
+     if (codePointCount < 0) {
+         // A negative count would be converted to a huge size_t in the memmove() size
+         // computation below and corrupt memory; treat it as "cannot be marked".
+         return 0;
+     }
+     if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
+         // Marker has already been attached.
+         return codePointCount;
+     }
+     if (codePointCount >= maxCodePoint) {
+         // No room for the extra code point; the code points cannot be marked as a
+         // Beginning-of-Sentence.
+         return 0;
+     }
+     // Source and destination overlap, so memmove() (not memcpy()) is required.
+     memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
+     codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
+     return codePointCount + 1;
+ }
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
+ static const int MIN_UNICODE_CODE_POINT;
+ static const int MAX_UNICODE_CODE_POINT;
+
/**
* Table mapping most combined Latin, Greek, and Cyrillic characters
* to their base characters. If c is in range, BASE_CHARS[c] == c