about | summary | refs | log | tree | commit | diff | stats
path: root/native/jni/src
diff options
context:
space:
mode:
Diffstat (limited to 'native/jni/src')
-rw-r--r-- native/jni/src/binary_format.h 12
-rw-r--r-- native/jni/src/char_utils.cpp 190
-rw-r--r-- native/jni/src/char_utils.h 3
-rw-r--r-- native/jni/src/defines.h 5
-rw-r--r-- native/jni/src/digraph_utils.cpp 4
-rw-r--r-- native/jni/src/suggest/core/dicnode/dic_node.h 5
-rw-r--r-- native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h 17
-rw-r--r-- native/jni/src/suggest/core/policy/weighting.cpp 30
-rw-r--r-- native/jni/src/suggest/core/policy/weighting.h 6
-rw-r--r-- native/jni/src/suggest/core/session/dic_traverse_session.cpp 2
-rw-r--r-- native/jni/src/suggest/core/session/dic_traverse_session.h 13
-rw-r--r-- native/jni/src/suggest/core/suggest.cpp 24
-rw-r--r-- native/jni/src/suggest/core/suggest.h 15
-rw-r--r-- native/jni/src/suggest/policyimpl/typing/scoring_params.cpp 6
-rw-r--r-- native/jni/src/suggest/policyimpl/typing/typing_traversal.cpp 4
-rw-r--r-- native/jni/src/suggest/policyimpl/typing/typing_traversal.h 8
-rw-r--r-- native/jni/src/suggest/policyimpl/typing/typing_weighting.h 24
17 files changed, 261 insertions, 107 deletions
diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h
index 1c4061fd8..2d2e19501 100644
--- a/native/jni/src/binary_format.h
+++ b/native/jni/src/binary_format.h
@@ -92,6 +92,7 @@ class BinaryFormat {
const int unigramProbability, const int bigramProbability);
static int getProbability(const int position, const std::map<int, int> *bigramMap,
const uint8_t *bigramFilter, const int unigramProbability);
+ static float getMultiWordCostMultiplier(const uint8_t *const dict);
// Flags for special processing
// Those *must* match the flags in makedict (BinaryDictInputOutput#*_PROCESSING_FLAG) or
@@ -241,6 +242,17 @@ AK_FORCE_INLINE int BinaryFormat::getGroupCountAndForwardPointer(const uint8_t *
return ((msb & 0x7F) << 8) | dict[(*pos)++];
}
+inline float BinaryFormat::getMultiWordCostMultiplier(const uint8_t *const dict) { // Maps the dictionary's demotion-rate header value to a float cost multiplier.
+ const int headerValue = readHeaderValueInt(dict, "MULTIPLE_WORDS_DEMOTION_RATE");
+ if (headerValue == S_INT_MIN) { // NOTE(review): S_INT_MIN presumably means the header key is absent - confirm in readHeaderValueInt.
+ return 1.0f;
+ }
+ if (headerValue <= 0) { // Guards the division below against zero and negative rates.
+ return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
+ }
+ return 100.0f / static_cast<float>(headerValue); // A header value of 100 yields 1.0f; larger values yield a smaller multiplier.
+}
+
inline uint8_t BinaryFormat::getFlagsAndForwardPointer(const uint8_t *const dict, int *pos) {
return dict[(*pos)++];
}
diff --git a/native/jni/src/char_utils.cpp b/native/jni/src/char_utils.cpp
index 8d917ea74..e219beb62 100644
--- a/native/jni/src/char_utils.cpp
+++ b/native/jni/src/char_utils.cpp
@@ -45,18 +45,16 @@ struct LatinCapitalSmallPair {
extern "C" int main() {
for (unsigned short c = 0; c < 0xFFFF; c++) {
- const unsigned short baseC = c < NELEMS(BASE_CHARS) ? BASE_CHARS[c] : c;
- if (baseC <= 0x7F) continue;
- const unsigned short icu4cLowerBaseC = u_tolower(baseC);
- const unsigned short myLowerBaseC = latin_tolower(baseC);
- if (baseC != icu4cLowerBaseC) {
+ if (c <= 0x7F) continue;
+ const unsigned short icu4cLowerC = u_tolower(c);
+ const unsigned short myLowerC = latin_tolower(c);
+ if (c != icu4cLowerC) {
#ifdef CONFIRMING_CHAR_UTILS
- if (icu4cLowerBaseC != myLowerBaseC) {
- fprintf(stderr, "icu4cLowerBaseC != myLowerBaseC, 0x%04X, 0x%04X\n",
- icu4cLowerBaseC, myLowerBaseC);
+ if (icu4cLowerC != myLowerC) {
+ fprintf(stderr, "icu4cLowerC != myLowerC, 0x%04X, 0x%04X\n", icu4cLowerC, myLowerC);
}
#else // CONFIRMING_CHAR_UTILS
- printf("0x%04X, 0x%04X\n", baseC, icu4cLowerBaseC);
+ printf("0x%04X, 0x%04X\n", c, icu4cLowerC);
#endif // CONFIRMING_CHAR_UTILS
}
}
@@ -77,14 +75,99 @@ extern "C" int main() {
* $
*/
static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
+ { 0x00C0, 0x00E0 }, // LATIN CAPITAL LETTER A WITH GRAVE
+ { 0x00C1, 0x00E1 }, // LATIN CAPITAL LETTER A WITH ACUTE
+ { 0x00C2, 0x00E2 }, // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
+ { 0x00C3, 0x00E3 }, // LATIN CAPITAL LETTER A WITH TILDE
+ { 0x00C4, 0x00E4 }, // LATIN CAPITAL LETTER A WITH DIAERESIS
+ { 0x00C5, 0x00E5 }, // LATIN CAPITAL LETTER A WITH RING ABOVE
{ 0x00C6, 0x00E6 }, // LATIN CAPITAL LETTER AE
+ { 0x00C7, 0x00E7 }, // LATIN CAPITAL LETTER C WITH CEDILLA
+ { 0x00C8, 0x00E8 }, // LATIN CAPITAL LETTER E WITH GRAVE
+ { 0x00C9, 0x00E9 }, // LATIN CAPITAL LETTER E WITH ACUTE
+ { 0x00CA, 0x00EA }, // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
+ { 0x00CB, 0x00EB }, // LATIN CAPITAL LETTER E WITH DIAERESIS
+ { 0x00CC, 0x00EC }, // LATIN CAPITAL LETTER I WITH GRAVE
+ { 0x00CD, 0x00ED }, // LATIN CAPITAL LETTER I WITH ACUTE
+ { 0x00CE, 0x00EE }, // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
+ { 0x00CF, 0x00EF }, // LATIN CAPITAL LETTER I WITH DIAERESIS
{ 0x00D0, 0x00F0 }, // LATIN CAPITAL LETTER ETH
+ { 0x00D1, 0x00F1 }, // LATIN CAPITAL LETTER N WITH TILDE
+ { 0x00D2, 0x00F2 }, // LATIN CAPITAL LETTER O WITH GRAVE
+ { 0x00D3, 0x00F3 }, // LATIN CAPITAL LETTER O WITH ACUTE
+ { 0x00D4, 0x00F4 }, // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
+ { 0x00D5, 0x00F5 }, // LATIN CAPITAL LETTER O WITH TILDE
+ { 0x00D6, 0x00F6 }, // LATIN CAPITAL LETTER O WITH DIAERESIS
+ { 0x00D8, 0x00F8 }, // LATIN CAPITAL LETTER O WITH STROKE
+ { 0x00D9, 0x00F9 }, // LATIN CAPITAL LETTER U WITH GRAVE
+ { 0x00DA, 0x00FA }, // LATIN CAPITAL LETTER U WITH ACUTE
+ { 0x00DB, 0x00FB }, // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
+ { 0x00DC, 0x00FC }, // LATIN CAPITAL LETTER U WITH DIAERESIS
+ { 0x00DD, 0x00FD }, // LATIN CAPITAL LETTER Y WITH ACUTE
{ 0x00DE, 0x00FE }, // LATIN CAPITAL LETTER THORN
+ { 0x0100, 0x0101 }, // LATIN CAPITAL LETTER A WITH MACRON
+ { 0x0102, 0x0103 }, // LATIN CAPITAL LETTER A WITH BREVE
+ { 0x0104, 0x0105 }, // LATIN CAPITAL LETTER A WITH OGONEK
+ { 0x0106, 0x0107 }, // LATIN CAPITAL LETTER C WITH ACUTE
+ { 0x0108, 0x0109 }, // LATIN CAPITAL LETTER C WITH CIRCUMFLEX
+ { 0x010A, 0x010B }, // LATIN CAPITAL LETTER C WITH DOT ABOVE
+ { 0x010C, 0x010D }, // LATIN CAPITAL LETTER C WITH CARON
+ { 0x010E, 0x010F }, // LATIN CAPITAL LETTER D WITH CARON
{ 0x0110, 0x0111 }, // LATIN CAPITAL LETTER D WITH STROKE
+ { 0x0112, 0x0113 }, // LATIN CAPITAL LETTER E WITH MACRON
+ { 0x0114, 0x0115 }, // LATIN CAPITAL LETTER E WITH BREVE
+ { 0x0116, 0x0117 }, // LATIN CAPITAL LETTER E WITH DOT ABOVE
+ { 0x0118, 0x0119 }, // LATIN CAPITAL LETTER E WITH OGONEK
+ { 0x011A, 0x011B }, // LATIN CAPITAL LETTER E WITH CARON
+ { 0x011C, 0x011D }, // LATIN CAPITAL LETTER G WITH CIRCUMFLEX
+ { 0x011E, 0x011F }, // LATIN CAPITAL LETTER G WITH BREVE
+ { 0x0120, 0x0121 }, // LATIN CAPITAL LETTER G WITH DOT ABOVE
+ { 0x0122, 0x0123 }, // LATIN CAPITAL LETTER G WITH CEDILLA
+ { 0x0124, 0x0125 }, // LATIN CAPITAL LETTER H WITH CIRCUMFLEX
{ 0x0126, 0x0127 }, // LATIN CAPITAL LETTER H WITH STROKE
+ { 0x0128, 0x0129 }, // LATIN CAPITAL LETTER I WITH TILDE
+ { 0x012A, 0x012B }, // LATIN CAPITAL LETTER I WITH MACRON
+ { 0x012C, 0x012D }, // LATIN CAPITAL LETTER I WITH BREVE
+ { 0x012E, 0x012F }, // LATIN CAPITAL LETTER I WITH OGONEK
+ { 0x0130, 0x0069 }, // LATIN CAPITAL LETTER I WITH DOT ABOVE
+ { 0x0132, 0x0133 }, // LATIN CAPITAL LIGATURE IJ
+ { 0x0134, 0x0135 }, // LATIN CAPITAL LETTER J WITH CIRCUMFLEX
+ { 0x0136, 0x0137 }, // LATIN CAPITAL LETTER K WITH CEDILLA
+ { 0x0139, 0x013A }, // LATIN CAPITAL LETTER L WITH ACUTE
+ { 0x013B, 0x013C }, // LATIN CAPITAL LETTER L WITH CEDILLA
+ { 0x013D, 0x013E }, // LATIN CAPITAL LETTER L WITH CARON
+ { 0x013F, 0x0140 }, // LATIN CAPITAL LETTER L WITH MIDDLE DOT
+ { 0x0141, 0x0142 }, // LATIN CAPITAL LETTER L WITH STROKE
+ { 0x0143, 0x0144 }, // LATIN CAPITAL LETTER N WITH ACUTE
+ { 0x0145, 0x0146 }, // LATIN CAPITAL LETTER N WITH CEDILLA
+ { 0x0147, 0x0148 }, // LATIN CAPITAL LETTER N WITH CARON
{ 0x014A, 0x014B }, // LATIN CAPITAL LETTER ENG
+ { 0x014C, 0x014D }, // LATIN CAPITAL LETTER O WITH MACRON
+ { 0x014E, 0x014F }, // LATIN CAPITAL LETTER O WITH BREVE
+ { 0x0150, 0x0151 }, // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
{ 0x0152, 0x0153 }, // LATIN CAPITAL LIGATURE OE
+ { 0x0154, 0x0155 }, // LATIN CAPITAL LETTER R WITH ACUTE
+ { 0x0156, 0x0157 }, // LATIN CAPITAL LETTER R WITH CEDILLA
+ { 0x0158, 0x0159 }, // LATIN CAPITAL LETTER R WITH CARON
+ { 0x015A, 0x015B }, // LATIN CAPITAL LETTER S WITH ACUTE
+ { 0x015C, 0x015D }, // LATIN CAPITAL LETTER S WITH CIRCUMFLEX
+ { 0x015E, 0x015F }, // LATIN CAPITAL LETTER S WITH CEDILLA
+ { 0x0160, 0x0161 }, // LATIN CAPITAL LETTER S WITH CARON
+ { 0x0162, 0x0163 }, // LATIN CAPITAL LETTER T WITH CEDILLA
+ { 0x0164, 0x0165 }, // LATIN CAPITAL LETTER T WITH CARON
{ 0x0166, 0x0167 }, // LATIN CAPITAL LETTER T WITH STROKE
+ { 0x0168, 0x0169 }, // LATIN CAPITAL LETTER U WITH TILDE
+ { 0x016A, 0x016B }, // LATIN CAPITAL LETTER U WITH MACRON
+ { 0x016C, 0x016D }, // LATIN CAPITAL LETTER U WITH BREVE
+ { 0x016E, 0x016F }, // LATIN CAPITAL LETTER U WITH RING ABOVE
+ { 0x0170, 0x0171 }, // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
+ { 0x0172, 0x0173 }, // LATIN CAPITAL LETTER U WITH OGONEK
+ { 0x0174, 0x0175 }, // LATIN CAPITAL LETTER W WITH CIRCUMFLEX
+ { 0x0176, 0x0177 }, // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
+ { 0x0178, 0x00FF }, // LATIN CAPITAL LETTER Y WITH DIAERESIS
+ { 0x0179, 0x017A }, // LATIN CAPITAL LETTER Z WITH ACUTE
+ { 0x017B, 0x017C }, // LATIN CAPITAL LETTER Z WITH DOT ABOVE
+ { 0x017D, 0x017E }, // LATIN CAPITAL LETTER Z WITH CARON
{ 0x0181, 0x0253 }, // LATIN CAPITAL LETTER B WITH HOOK
{ 0x0182, 0x0183 }, // LATIN CAPITAL LETTER B WITH TOPBAR
{ 0x0184, 0x0185 }, // LATIN CAPITAL LETTER TONE SIX
@@ -105,6 +188,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x019C, 0x026F }, // LATIN CAPITAL LETTER TURNED M
{ 0x019D, 0x0272 }, // LATIN CAPITAL LETTER N WITH LEFT HOOK
{ 0x019F, 0x0275 }, // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
+ { 0x01A0, 0x01A1 }, // LATIN CAPITAL LETTER O WITH HORN
{ 0x01A2, 0x01A3 }, // LATIN CAPITAL LETTER OI
{ 0x01A4, 0x01A5 }, // LATIN CAPITAL LETTER P WITH HOOK
{ 0x01A6, 0x0280 }, // LATIN LETTER YR
@@ -112,6 +196,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x01A9, 0x0283 }, // LATIN CAPITAL LETTER ESH
{ 0x01AC, 0x01AD }, // LATIN CAPITAL LETTER T WITH HOOK
{ 0x01AE, 0x0288 }, // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
+ { 0x01AF, 0x01B0 }, // LATIN CAPITAL LETTER U WITH HORN
{ 0x01B1, 0x028A }, // LATIN CAPITAL LETTER UPSILON
{ 0x01B2, 0x028B }, // LATIN CAPITAL LETTER V WITH HOOK
{ 0x01B3, 0x01B4 }, // LATIN CAPITAL LETTER Y WITH HOOK
@@ -119,13 +204,64 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x01B7, 0x0292 }, // LATIN CAPITAL LETTER EZH
{ 0x01B8, 0x01B9 }, // LATIN CAPITAL LETTER EZH REVERSED
{ 0x01BC, 0x01BD }, // LATIN CAPITAL LETTER TONE FIVE
+ { 0x01C4, 0x01C6 }, // LATIN CAPITAL LETTER DZ WITH CARON
+ { 0x01C5, 0x01C6 }, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
+ { 0x01C7, 0x01C9 }, // LATIN CAPITAL LETTER LJ
+ { 0x01C8, 0x01C9 }, // LATIN CAPITAL LETTER L WITH SMALL LETTER J
+ { 0x01CA, 0x01CC }, // LATIN CAPITAL LETTER NJ
+ { 0x01CB, 0x01CC }, // LATIN CAPITAL LETTER N WITH SMALL LETTER J
+ { 0x01CD, 0x01CE }, // LATIN CAPITAL LETTER A WITH CARON
+ { 0x01CF, 0x01D0 }, // LATIN CAPITAL LETTER I WITH CARON
+ { 0x01D1, 0x01D2 }, // LATIN CAPITAL LETTER O WITH CARON
+ { 0x01D3, 0x01D4 }, // LATIN CAPITAL LETTER U WITH CARON
+ { 0x01D5, 0x01D6 }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
+ { 0x01D7, 0x01D8 }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
+ { 0x01D9, 0x01DA }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
+ { 0x01DB, 0x01DC }, // LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
+ { 0x01DE, 0x01DF }, // LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON
+ { 0x01E0, 0x01E1 }, // LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON
+ { 0x01E2, 0x01E3 }, // LATIN CAPITAL LETTER AE WITH MACRON
{ 0x01E4, 0x01E5 }, // LATIN CAPITAL LETTER G WITH STROKE
+ { 0x01E6, 0x01E7 }, // LATIN CAPITAL LETTER G WITH CARON
+ { 0x01E8, 0x01E9 }, // LATIN CAPITAL LETTER K WITH CARON
+ { 0x01EA, 0x01EB }, // LATIN CAPITAL LETTER O WITH OGONEK
+ { 0x01EC, 0x01ED }, // LATIN CAPITAL LETTER O WITH OGONEK AND MACRON
+ { 0x01EE, 0x01EF }, // LATIN CAPITAL LETTER EZH WITH CARON
+ { 0x01F1, 0x01F3 }, // LATIN CAPITAL LETTER DZ
+ { 0x01F2, 0x01F3 }, // LATIN CAPITAL LETTER D WITH SMALL LETTER Z
+ { 0x01F4, 0x01F5 }, // LATIN CAPITAL LETTER G WITH ACUTE
{ 0x01F6, 0x0195 }, // LATIN CAPITAL LETTER HWAIR
{ 0x01F7, 0x01BF }, // LATIN CAPITAL LETTER WYNN
+ { 0x01F8, 0x01F9 }, // LATIN CAPITAL LETTER N WITH GRAVE
+ { 0x01FA, 0x01FB }, // LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
+ { 0x01FC, 0x01FD }, // LATIN CAPITAL LETTER AE WITH ACUTE
+ { 0x01FE, 0x01FF }, // LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
+ { 0x0200, 0x0201 }, // LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
+ { 0x0202, 0x0203 }, // LATIN CAPITAL LETTER A WITH INVERTED BREVE
+ { 0x0204, 0x0205 }, // LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
+ { 0x0206, 0x0207 }, // LATIN CAPITAL LETTER E WITH INVERTED BREVE
+ { 0x0208, 0x0209 }, // LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+ { 0x020A, 0x020B }, // LATIN CAPITAL LETTER I WITH INVERTED BREVE
+ { 0x020C, 0x020D }, // LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
+ { 0x020E, 0x020F }, // LATIN CAPITAL LETTER O WITH INVERTED BREVE
+ { 0x0210, 0x0211 }, // LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
+ { 0x0212, 0x0213 }, // LATIN CAPITAL LETTER R WITH INVERTED BREVE
+ { 0x0214, 0x0215 }, // LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
+ { 0x0216, 0x0217 }, // LATIN CAPITAL LETTER U WITH INVERTED BREVE
+ { 0x0218, 0x0219 }, // LATIN CAPITAL LETTER S WITH COMMA BELOW
+ { 0x021A, 0x021B }, // LATIN CAPITAL LETTER T WITH COMMA BELOW
{ 0x021C, 0x021D }, // LATIN CAPITAL LETTER YOGH
+ { 0x021E, 0x021F }, // LATIN CAPITAL LETTER H WITH CARON
{ 0x0220, 0x019E }, // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
{ 0x0222, 0x0223 }, // LATIN CAPITAL LETTER OU
{ 0x0224, 0x0225 }, // LATIN CAPITAL LETTER Z WITH HOOK
+ { 0x0226, 0x0227 }, // LATIN CAPITAL LETTER A WITH DOT ABOVE
+ { 0x0228, 0x0229 }, // LATIN CAPITAL LETTER E WITH CEDILLA
+ { 0x022A, 0x022B }, // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
+ { 0x022C, 0x022D }, // LATIN CAPITAL LETTER O WITH TILDE AND MACRON
+ { 0x022E, 0x022F }, // LATIN CAPITAL LETTER O WITH DOT ABOVE
+ { 0x0230, 0x0231 }, // LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON
+ { 0x0232, 0x0233 }, // LATIN CAPITAL LETTER Y WITH MACRON
{ 0x023A, 0x2C65 }, // LATIN CAPITAL LETTER A WITH STROKE
{ 0x023B, 0x023C }, // LATIN CAPITAL LETTER C WITH STROKE
{ 0x023D, 0x019A }, // LATIN CAPITAL LETTER L WITH BAR
@@ -142,6 +278,13 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x0370, 0x0371 }, // GREEK CAPITAL LETTER HETA
{ 0x0372, 0x0373 }, // GREEK CAPITAL LETTER ARCHAIC SAMPI
{ 0x0376, 0x0377 }, // GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+ { 0x0386, 0x03AC }, // GREEK CAPITAL LETTER ALPHA WITH TONOS
+ { 0x0388, 0x03AD }, // GREEK CAPITAL LETTER EPSILON WITH TONOS
+ { 0x0389, 0x03AE }, // GREEK CAPITAL LETTER ETA WITH TONOS
+ { 0x038A, 0x03AF }, // GREEK CAPITAL LETTER IOTA WITH TONOS
+ { 0x038C, 0x03CC }, // GREEK CAPITAL LETTER OMICRON WITH TONOS
+ { 0x038E, 0x03CD }, // GREEK CAPITAL LETTER UPSILON WITH TONOS
+ { 0x038F, 0x03CE }, // GREEK CAPITAL LETTER OMEGA WITH TONOS
{ 0x0391, 0x03B1 }, // GREEK CAPITAL LETTER ALPHA
{ 0x0392, 0x03B2 }, // GREEK CAPITAL LETTER BETA
{ 0x0393, 0x03B3 }, // GREEK CAPITAL LETTER GAMMA
@@ -166,6 +309,8 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x03A7, 0x03C7 }, // GREEK CAPITAL LETTER CHI
{ 0x03A8, 0x03C8 }, // GREEK CAPITAL LETTER PSI
{ 0x03A9, 0x03C9 }, // GREEK CAPITAL LETTER OMEGA
+ { 0x03AA, 0x03CA }, // GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
+ { 0x03AB, 0x03CB }, // GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
{ 0x03CF, 0x03D7 }, // GREEK CAPITAL KAI SYMBOL
{ 0x03D8, 0x03D9 }, // GREEK LETTER ARCHAIC KOPPA
{ 0x03DA, 0x03DB }, // GREEK LETTER STIGMA
@@ -179,19 +324,28 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x03EA, 0x03EB }, // COPTIC CAPITAL LETTER GANGIA
{ 0x03EC, 0x03ED }, // COPTIC CAPITAL LETTER SHIMA
{ 0x03EE, 0x03EF }, // COPTIC CAPITAL LETTER DEI
+ { 0x03F4, 0x03B8 }, // GREEK CAPITAL THETA SYMBOL
{ 0x03F7, 0x03F8 }, // GREEK CAPITAL LETTER SHO
+ { 0x03F9, 0x03F2 }, // GREEK CAPITAL LUNATE SIGMA SYMBOL
{ 0x03FA, 0x03FB }, // GREEK CAPITAL LETTER SAN
{ 0x03FD, 0x037B }, // GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL
{ 0x03FE, 0x037C }, // GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL
{ 0x03FF, 0x037D }, // GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
+ { 0x0400, 0x0450 }, // CYRILLIC CAPITAL LETTER IE WITH GRAVE
+ { 0x0401, 0x0451 }, // CYRILLIC CAPITAL LETTER IO
{ 0x0402, 0x0452 }, // CYRILLIC CAPITAL LETTER DJE
+ { 0x0403, 0x0453 }, // CYRILLIC CAPITAL LETTER GJE
{ 0x0404, 0x0454 }, // CYRILLIC CAPITAL LETTER UKRAINIAN IE
{ 0x0405, 0x0455 }, // CYRILLIC CAPITAL LETTER DZE
{ 0x0406, 0x0456 }, // CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
+ { 0x0407, 0x0457 }, // CYRILLIC CAPITAL LETTER YI
{ 0x0408, 0x0458 }, // CYRILLIC CAPITAL LETTER JE
{ 0x0409, 0x0459 }, // CYRILLIC CAPITAL LETTER LJE
{ 0x040A, 0x045A }, // CYRILLIC CAPITAL LETTER NJE
{ 0x040B, 0x045B }, // CYRILLIC CAPITAL LETTER TSHE
+ { 0x040C, 0x045C }, // CYRILLIC CAPITAL LETTER KJE
+ { 0x040D, 0x045D }, // CYRILLIC CAPITAL LETTER I WITH GRAVE
+ { 0x040E, 0x045E }, // CYRILLIC CAPITAL LETTER SHORT U
{ 0x040F, 0x045F }, // CYRILLIC CAPITAL LETTER DZHE
{ 0x0410, 0x0430 }, // CYRILLIC CAPITAL LETTER A
{ 0x0411, 0x0431 }, // CYRILLIC CAPITAL LETTER BE
@@ -236,6 +390,7 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x0470, 0x0471 }, // CYRILLIC CAPITAL LETTER PSI
{ 0x0472, 0x0473 }, // CYRILLIC CAPITAL LETTER FITA
{ 0x0474, 0x0475 }, // CYRILLIC CAPITAL LETTER IZHITSA
+ { 0x0476, 0x0477 }, // CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
{ 0x0478, 0x0479 }, // CYRILLIC CAPITAL LETTER UK
{ 0x047A, 0x047B }, // CYRILLIC CAPITAL LETTER ROUND OMEGA
{ 0x047C, 0x047D }, // CYRILLIC CAPITAL LETTER OMEGA WITH TITLO
@@ -269,17 +424,34 @@ static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
{ 0x04BC, 0x04BD }, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE
{ 0x04BE, 0x04BF }, // CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER
{ 0x04C0, 0x04CF }, // CYRILLIC LETTER PALOCHKA
+ { 0x04C1, 0x04C2 }, // CYRILLIC CAPITAL LETTER ZHE WITH BREVE
{ 0x04C3, 0x04C4 }, // CYRILLIC CAPITAL LETTER KA WITH HOOK
{ 0x04C5, 0x04C6 }, // CYRILLIC CAPITAL LETTER EL WITH TAIL
{ 0x04C7, 0x04C8 }, // CYRILLIC CAPITAL LETTER EN WITH HOOK
{ 0x04C9, 0x04CA }, // CYRILLIC CAPITAL LETTER EN WITH TAIL
{ 0x04CB, 0x04CC }, // CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
{ 0x04CD, 0x04CE }, // CYRILLIC CAPITAL LETTER EM WITH TAIL
+ { 0x04D0, 0x04D1 }, // CYRILLIC CAPITAL LETTER A WITH BREVE
+ { 0x04D2, 0x04D3 }, // CYRILLIC CAPITAL LETTER A WITH DIAERESIS
{ 0x04D4, 0x04D5 }, // CYRILLIC CAPITAL LIGATURE A IE
+ { 0x04D6, 0x04D7 }, // CYRILLIC CAPITAL LETTER IE WITH BREVE
{ 0x04D8, 0x04D9 }, // CYRILLIC CAPITAL LETTER SCHWA
+ { 0x04DA, 0x04DB }, // CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS
+ { 0x04DC, 0x04DD }, // CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
+ { 0x04DE, 0x04DF }, // CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS
{ 0x04E0, 0x04E1 }, // CYRILLIC CAPITAL LETTER ABKHASIAN DZE
+ { 0x04E2, 0x04E3 }, // CYRILLIC CAPITAL LETTER I WITH MACRON
+ { 0x04E4, 0x04E5 }, // CYRILLIC CAPITAL LETTER I WITH DIAERESIS
+ { 0x04E6, 0x04E7 }, // CYRILLIC CAPITAL LETTER O WITH DIAERESIS
{ 0x04E8, 0x04E9 }, // CYRILLIC CAPITAL LETTER BARRED O
+ { 0x04EA, 0x04EB }, // CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS
+ { 0x04EC, 0x04ED }, // CYRILLIC CAPITAL LETTER E WITH DIAERESIS
+ { 0x04EE, 0x04EF }, // CYRILLIC CAPITAL LETTER U WITH MACRON
+ { 0x04F0, 0x04F1 }, // CYRILLIC CAPITAL LETTER U WITH DIAERESIS
+ { 0x04F2, 0x04F3 }, // CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
+ { 0x04F4, 0x04F5 }, // CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS
{ 0x04F6, 0x04F7 }, // CYRILLIC CAPITAL LETTER GHE WITH DESCENDER
+ { 0x04F8, 0x04F9 }, // CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS
{ 0x04FA, 0x04FB }, // CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK
{ 0x04FC, 0x04FD }, // CYRILLIC CAPITAL LETTER HA WITH HOOK
{ 0x04FE, 0x04FF }, // CYRILLIC CAPITAL LETTER HA WITH STROKE
diff --git a/native/jni/src/char_utils.h b/native/jni/src/char_utils.h
index 58d388dbf..b429f40b2 100644
--- a/native/jni/src/char_utils.h
+++ b/native/jni/src/char_utils.h
@@ -58,7 +58,8 @@ inline static int toBaseCodePoint(int c) {
AK_FORCE_INLINE static int toLowerCase(const int c) {
if (isAsciiUpper(c)) {
return toAsciiLower(c);
- } else if (isAscii(c)) {
+ }
+ if (isAscii(c)) {
return c;
}
return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
diff --git a/native/jni/src/defines.h b/native/jni/src/defines.h
index a7b023a75..6ef9f414b 100644
--- a/native/jni/src/defines.h
+++ b/native/jni/src/defines.h
@@ -424,10 +424,9 @@ typedef enum {
CT_OMISSION,
CT_INSERTION,
CT_TRANSPOSITION,
- CT_SPACE_SUBSTITUTION,
- CT_SPACE_OMISSION,
CT_COMPLETION,
CT_TERMINAL,
- CT_NEW_WORD,
+ CT_NEW_WORD_SPACE_OMITTION,
+ CT_NEW_WORD_SPACE_SUBSTITUTION,
} CorrectionType;
#endif // LATINIME_DEFINES_H
diff --git a/native/jni/src/digraph_utils.cpp b/native/jni/src/digraph_utils.cpp
index 6a1ab0271..083442669 100644
--- a/native/jni/src/digraph_utils.cpp
+++ b/native/jni/src/digraph_utils.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+#include "char_utils.h"
#include "binary_format.h"
#include "defines.h"
#include "digraph_utils.h"
@@ -120,10 +121,11 @@ const DigraphUtils::DigraphType DigraphUtils::USED_DIGRAPH_TYPES[] =
/* static */ const DigraphUtils::digraph_t *DigraphUtils::getDigraphForDigraphTypeAndCodePoint(
const DigraphUtils::DigraphType digraphType, const int compositeGlyphCodePoint) {
const DigraphUtils::digraph_t *digraphs = 0;
+ const int compositeGlyphLowerCodePoint = toLowerCase(compositeGlyphCodePoint);
const int digraphsSize =
DigraphUtils::getAllDigraphsForDictionaryAndReturnSize(digraphType, &digraphs);
for (int i = 0; i < digraphsSize; i++) {
- if (digraphs[i].compositeGlyph == compositeGlyphCodePoint) {
+ if (digraphs[i].compositeGlyph == compositeGlyphLowerCodePoint) {
return &digraphs[i];
}
}
diff --git a/native/jni/src/suggest/core/dicnode/dic_node.h b/native/jni/src/suggest/core/dicnode/dic_node.h
index 32faae52c..f8d2df452 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node.h
@@ -360,11 +360,6 @@ class DicNode {
return mDicNodeState.mDicNodeStateScoring.getCompoundDistance(languageWeight);
}
- // Note that "cost" means delta for "distance" that is weighted.
- float getTotalPrevWordsLanguageCost() const {
- return mDicNodeState.mDicNodeStateScoring.getTotalPrevWordsLanguageCost();
- }
-
// Used to commit input partially
int getPrevWordNodePos() const {
return mDicNodeState.mDicNodeStatePrevWord.getPrevWordNodePos();
diff --git a/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h b/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h
index 8902d3122..fd9d610e3 100644
--- a/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h
+++ b/native/jni/src/suggest/core/dicnode/dic_node_state_scoring.h
@@ -31,7 +31,7 @@ class DicNodeStateScoring {
mDigraphIndex(DigraphUtils::NOT_A_DIGRAPH_INDEX),
mEditCorrectionCount(0), mProximityCorrectionCount(0),
mNormalizedCompoundDistance(0.0f), mSpatialDistance(0.0f), mLanguageDistance(0.0f),
- mTotalPrevWordsLanguageCost(0.0f), mRawLength(0.0f) {
+ mRawLength(0.0f) {
}
virtual ~DicNodeStateScoring() {}
@@ -42,7 +42,6 @@ class DicNodeStateScoring {
mNormalizedCompoundDistance = 0.0f;
mSpatialDistance = 0.0f;
mLanguageDistance = 0.0f;
- mTotalPrevWordsLanguageCost = 0.0f;
mRawLength = 0.0f;
mDoubleLetterLevel = NOT_A_DOUBLE_LETTER;
mDigraphIndex = DigraphUtils::NOT_A_DIGRAPH_INDEX;
@@ -54,7 +53,6 @@ class DicNodeStateScoring {
mNormalizedCompoundDistance = scoring->mNormalizedCompoundDistance;
mSpatialDistance = scoring->mSpatialDistance;
mLanguageDistance = scoring->mLanguageDistance;
- mTotalPrevWordsLanguageCost = scoring->mTotalPrevWordsLanguageCost;
mRawLength = scoring->mRawLength;
mDoubleLetterLevel = scoring->mDoubleLetterLevel;
mDigraphIndex = scoring->mDigraphIndex;
@@ -70,9 +68,6 @@ class DicNodeStateScoring {
if (isProximityCorrection) {
++mProximityCorrectionCount;
}
- if (languageCost > 0.0f) {
- setTotalPrevWordsLanguageCost(mTotalPrevWordsLanguageCost + languageCost);
- }
}
void addRawLength(const float rawLength) {
@@ -148,10 +143,6 @@ class DicNodeStateScoring {
}
}
- float getTotalPrevWordsLanguageCost() const {
- return mTotalPrevWordsLanguageCost;
- }
-
private:
// Caution!!!
// Use a default copy constructor and an assign operator because shallow copies are ok
@@ -165,7 +156,6 @@ class DicNodeStateScoring {
float mNormalizedCompoundDistance;
float mSpatialDistance;
float mLanguageDistance;
- float mTotalPrevWordsLanguageCost;
float mRawLength;
AK_FORCE_INLINE void addDistance(float spatialDistance, float languageDistance,
@@ -179,11 +169,6 @@ class DicNodeStateScoring {
/ static_cast<float>(max(1, totalInputIndex));
}
}
-
- //TODO: remove
- AK_FORCE_INLINE void setTotalPrevWordsLanguageCost(float totalPrevWordsLanguageCost) {
- mTotalPrevWordsLanguageCost = totalPrevWordsLanguageCost;
- }
};
} // namespace latinime
#endif // LATINIME_DIC_NODE_STATE_SCORING_H
diff --git a/native/jni/src/suggest/core/policy/weighting.cpp b/native/jni/src/suggest/core/policy/weighting.cpp
index e62b70423..b9c0b8129 100644
--- a/native/jni/src/suggest/core/policy/weighting.cpp
+++ b/native/jni/src/suggest/core/policy/weighting.cpp
@@ -38,7 +38,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_SUBSTITUTION:
PROF_SUBSTITUTION(node->mProfiler);
return;
- case CT_NEW_WORD:
+ case CT_NEW_WORD_SPACE_OMITTION:
PROF_NEW_WORD(node->mProfiler);
return;
case CT_MATCH:
@@ -50,7 +50,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_TERMINAL:
PROF_TERMINAL(node->mProfiler);
return;
- case CT_SPACE_SUBSTITUTION:
+ case CT_NEW_WORD_SPACE_SUBSTITUTION:
PROF_SPACE_SUBSTITUTION(node->mProfiler);
return;
case CT_INSERTION:
@@ -107,16 +107,16 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_SUBSTITUTION:
// only used for typing
return weighting->getSubstitutionCost();
- case CT_NEW_WORD:
- return weighting->getNewWordCost(dicNode);
+ case CT_NEW_WORD_SPACE_OMITTION:
+ return weighting->getNewWordCost(traverseSession, dicNode);
case CT_MATCH:
return weighting->getMatchedCost(traverseSession, dicNode, inputStateG);
case CT_COMPLETION:
return weighting->getCompletionCost(traverseSession, dicNode);
case CT_TERMINAL:
return weighting->getTerminalSpatialCost(traverseSession, dicNode);
- case CT_SPACE_SUBSTITUTION:
- return weighting->getSpaceSubstitutionCost();
+ case CT_NEW_WORD_SPACE_SUBSTITUTION:
+ return weighting->getSpaceSubstitutionCost(traverseSession, dicNode);
case CT_INSERTION:
return weighting->getInsertionCost(traverseSession, parentDicNode, dicNode);
case CT_TRANSPOSITION:
@@ -135,7 +135,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return 0.0f;
case CT_SUBSTITUTION:
return 0.0f;
- case CT_NEW_WORD:
+ case CT_NEW_WORD_SPACE_OMITTION:
return weighting->getNewWordBigramCost(traverseSession, parentDicNode, bigramCacheMap);
case CT_MATCH:
return 0.0f;
@@ -147,8 +147,8 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
traverseSession->getOffsetDict(), dicNode, bigramCacheMap);
return weighting->getTerminalLanguageCost(traverseSession, dicNode, languageImprobability);
}
- case CT_SPACE_SUBSTITUTION:
- return 0.0f;
+ case CT_NEW_WORD_SPACE_SUBSTITUTION:
+ return weighting->getNewWordBigramCost(traverseSession, parentDicNode, bigramCacheMap);
case CT_INSERTION:
return 0.0f;
case CT_TRANSPOSITION:
@@ -168,7 +168,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
case CT_SUBSTITUTION:
// Should return true?
return false;
- case CT_NEW_WORD:
+ case CT_NEW_WORD_SPACE_OMITTION:
return false;
case CT_MATCH:
return false;
@@ -176,7 +176,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return false;
case CT_TERMINAL:
return false;
- case CT_SPACE_SUBSTITUTION:
+ case CT_NEW_WORD_SPACE_SUBSTITUTION:
return false;
case CT_INSERTION:
return true;
@@ -197,7 +197,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return false;
case CT_SUBSTITUTION:
return false;
- case CT_NEW_WORD:
+ case CT_NEW_WORD_SPACE_OMITTION:
return false;
case CT_MATCH:
return weighting->isProximityDicNode(traverseSession, dicNode);
@@ -205,7 +205,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return false;
case CT_TERMINAL:
return false;
- case CT_SPACE_SUBSTITUTION:
+ case CT_NEW_WORD_SPACE_SUBSTITUTION:
return false;
case CT_INSERTION:
return false;
@@ -224,7 +224,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return 0;
case CT_SUBSTITUTION:
return 0;
- case CT_NEW_WORD:
+ case CT_NEW_WORD_SPACE_OMITTION:
return 0;
case CT_MATCH:
return 1;
@@ -232,7 +232,7 @@ static inline void profile(const CorrectionType correctionType, DicNode *const n
return 0;
case CT_TERMINAL:
return 0;
- case CT_SPACE_SUBSTITUTION:
+ case CT_NEW_WORD_SPACE_SUBSTITUTION:
return 1;
case CT_INSERTION:
return 2;
diff --git a/native/jni/src/suggest/core/policy/weighting.h b/native/jni/src/suggest/core/policy/weighting.h
index b92dbe278..bce479c51 100644
--- a/native/jni/src/suggest/core/policy/weighting.h
+++ b/native/jni/src/suggest/core/policy/weighting.h
@@ -56,7 +56,8 @@ class Weighting {
const DicTraverseSession *const traverseSession,
const DicNode *const parentDicNode, const DicNode *const dicNode) const = 0;
- virtual float getNewWordCost(const DicNode *const dicNode) const = 0;
+ virtual float getNewWordCost(const DicTraverseSession *const traverseSession,
+ const DicNode *const dicNode) const = 0;
virtual float getNewWordBigramCost(
const DicTraverseSession *const traverseSession, const DicNode *const dicNode,
@@ -76,7 +77,8 @@ class Weighting {
virtual float getSubstitutionCost() const = 0;
- virtual float getSpaceSubstitutionCost() const = 0;
+ virtual float getSpaceSubstitutionCost(const DicTraverseSession *const traverseSession,
+ const DicNode *const dicNode) const = 0;
Weighting() {}
virtual ~Weighting() {}
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.cpp b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
index 5b783a2ba..3c44db21c 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.cpp
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.cpp
@@ -16,6 +16,7 @@
#include "suggest/core/session/dic_traverse_session.h"
+#include "binary_format.h"
#include "defines.h"
#include "dictionary.h"
#include "dic_traverse_wrapper.h"
@@ -63,6 +64,7 @@ static TraverseSessionFactoryRegisterer traverseSessionFactoryRegisterer;
void DicTraverseSession::init(const Dictionary *const dictionary, const int *prevWord,
int prevWordLength) {
mDictionary = dictionary;
+ mMultiWordCostMultiplier = BinaryFormat::getMultiWordCostMultiplier(mDictionary->getDict());
if (!prevWord) {
mPrevWordPos = NOT_VALID_WORD;
return;
diff --git a/native/jni/src/suggest/core/session/dic_traverse_session.h b/native/jni/src/suggest/core/session/dic_traverse_session.h
index fe0527639..d9c2a51d0 100644
--- a/native/jni/src/suggest/core/session/dic_traverse_session.h
+++ b/native/jni/src/suggest/core/session/dic_traverse_session.h
@@ -36,7 +36,8 @@ class DicTraverseSession {
AK_FORCE_INLINE DicTraverseSession(JNIEnv *env, jstring localeStr)
: mPrevWordPos(NOT_VALID_WORD), mProximityInfo(0),
mDictionary(0), mDicNodesCache(), mBigramCacheMap(),
- mInputSize(0), mPartiallyCommited(false), mMaxPointerCount(1) {
+ mInputSize(0), mPartiallyCommited(false), mMaxPointerCount(1),
+ mMultiWordCostMultiplier(1.0f) {
// NOTE: mProximityInfoStates is an array of instances.
// No need to initialize it explicitly here.
}
@@ -52,6 +53,7 @@ class DicTraverseSession {
const int maxPointerCount);
void resetCache(const int nextActiveCacheSize, const int maxWords);
+ // TODO: Remove
const uint8_t *getOffsetDict() const;
int getDictFlags() const;
@@ -150,6 +152,10 @@ class DicTraverseSession {
return mProximityInfoStates[0].touchPositionCorrectionEnabled();
}
+ float getMultiWordCostMultiplier() const {
+ return mMultiWordCostMultiplier;
+ }
+
private:
DISALLOW_IMPLICIT_CONSTRUCTORS(DicTraverseSession);
// threshold to start caching
@@ -170,6 +176,11 @@ class DicTraverseSession {
int mInputSize;
bool mPartiallyCommited;
int mMaxPointerCount;
+
+ /////////////////////////////////
+ // Configuration per dictionary
+ float mMultiWordCostMultiplier;
+
};
} // namespace latinime
#endif // LATINIME_DIC_TRAVERSE_SESSION_H
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 67d351fa1..9de2cd2e2 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -33,16 +33,9 @@
namespace latinime {
// Initialization of class constants.
-const int Suggest::LOOKAHEAD_DIC_NODES_CACHE_SIZE = 25;
const int Suggest::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
const int Suggest::MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE = 2;
const float Suggest::AUTOCORRECT_CLASSIFICATION_THRESHOLD = 0.33f;
-const float Suggest::AUTOCORRECT_LANGUAGE_FEATURE_THRESHOLD = 0.6f;
-
-const bool Suggest::CORRECT_SPACE_OMISSION = true;
-const bool Suggest::CORRECT_TRANSPOSITION = true;
-const bool Suggest::CORRECT_INSERTION = true;
-const bool Suggest::CORRECT_OMISSION_G = true;
/**
* Returns a set of suggestions for the given input touch points. The commitPoint argument indicates
@@ -270,12 +263,8 @@ void Suggest::expandCurrentDicNodes(DicTraverseSession *traverseSession) const {
// latest touch point yet. These are needed to apply look-ahead correction operations
// that require special handling of the latest touch point. For example, with insertions
// (e.g., "thiis" -> "this") the latest touch point should not be consumed at all.
- if (CORRECT_TRANSPOSITION) {
- processDicNodeAsTransposition(traverseSession, &dicNode);
- }
- if (CORRECT_INSERTION) {
- processDicNodeAsInsertion(traverseSession, &dicNode);
- }
+ processDicNodeAsTransposition(traverseSession, &dicNode);
+ processDicNodeAsInsertion(traverseSession, &dicNode);
} else { // !isLookAheadCorrection
// Only consider typing error corrections if the normalized compound distance is
// below a spatial distance threshold.
@@ -531,13 +520,10 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
DicNode newDicNode;
DicNodeUtils::initAsRootWithPreviousWord(traverseSession->getDicRootPos(),
traverseSession->getOffsetDict(), dicNode, &newDicNode);
- Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_NEW_WORD, traverseSession, dicNode,
+ const CorrectionType correctionType = spaceSubstitution ?
+ CT_NEW_WORD_SPACE_SUBSTITUTION : CT_NEW_WORD_SPACE_OMITTION;
+ Weighting::addCostAndForwardInputIndex(WEIGHTING, correctionType, traverseSession, dicNode,
&newDicNode, traverseSession->getBigramCacheMap());
- if (spaceSubstitution) {
- // Merge this with CT_NEW_WORD
- Weighting::addCostAndForwardInputIndex(WEIGHTING, CT_SPACE_SUBSTITUTION,
- traverseSession, 0, &newDicNode, 0 /* bigramCacheMap */);
- }
traverseSession->getDicTraverseCache()->copyPushNextActive(&newDicNode);
}
} // namespace latinime
diff --git a/native/jni/src/suggest/core/suggest.h b/native/jni/src/suggest/core/suggest.h
index becd6c1de..875cbe4e0 100644
--- a/native/jni/src/suggest/core/suggest.h
+++ b/native/jni/src/suggest/core/suggest.h
@@ -76,31 +76,16 @@ class Suggest : public SuggestInterface {
void processDicNodeAsMatch(DicTraverseSession *traverseSession,
DicNode *childDicNode) const;
- // Dic nodes cache size for lookahead (autocompletion)
- static const int LOOKAHEAD_DIC_NODES_CACHE_SIZE;
- // Max characters to lookahead
- static const int MAX_LOOKAHEAD;
// Inputs longer than this will autocorrect if the suggestion is multi-word
static const int MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT;
static const int MIN_CONTINUOUS_SUGGESTION_INPUT_SIZE;
- // Base value for converting costs into scores (low so will not autocorrect without classifier)
- static const float BASE_OUTPUT_SCORE;
// Threshold for autocorrection classifier
static const float AUTOCORRECT_CLASSIFICATION_THRESHOLD;
- // Threshold for computing the language model feature for autocorrect classification
- static const float AUTOCORRECT_LANGUAGE_FEATURE_THRESHOLD;
-
- // Typing error correction settings
- static const bool CORRECT_SPACE_OMISSION;
- static const bool CORRECT_TRANSPOSITION;
- static const bool CORRECT_INSERTION;
const Traversal *const TRAVERSAL;
const Scoring *const SCORING;
const Weighting *const WEIGHTING;
-
- static const bool CORRECT_OMISSION_G;
};
} // namespace latinime
#endif // LATINIME_SUGGEST_IMPL_H
diff --git a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
index 0fa684f01..11ccf1773 100644
--- a/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/scoring_params.cpp
@@ -35,17 +35,17 @@ const float ScoringParams::INSERTION_COST = 0.670f;
const float ScoringParams::INSERTION_COST_SAME_CHAR = 0.526f;
const float ScoringParams::INSERTION_COST_FIRST_CHAR = 0.563f;
const float ScoringParams::TRANSPOSITION_COST = 0.494f;
-const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.239f;
+const float ScoringParams::SPACE_SUBSTITUTION_COST = 0.289f;
const float ScoringParams::ADDITIONAL_PROXIMITY_COST = 0.380f;
const float ScoringParams::SUBSTITUTION_COST = 0.363f;
-const float ScoringParams::COST_NEW_WORD = 0.054f;
+const float ScoringParams::COST_NEW_WORD = 0.024f;
const float ScoringParams::COST_NEW_WORD_CAPITALIZED = 0.174f;
const float ScoringParams::DISTANCE_WEIGHT_LANGUAGE = 1.123f;
const float ScoringParams::COST_FIRST_LOOKAHEAD = 0.462f;
const float ScoringParams::COST_LOOKAHEAD = 0.092f;
const float ScoringParams::HAS_PROXIMITY_TERMINAL_COST = 0.126f;
const float ScoringParams::HAS_EDIT_CORRECTION_TERMINAL_COST = 0.056f;
-const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.136f;
+const float ScoringParams::HAS_MULTI_WORD_TERMINAL_COST = 0.536f;
const float ScoringParams::TYPING_BASE_OUTPUT_SCORE = 1.0f;
const float ScoringParams::TYPING_MAX_OUTPUT_SCORE_PER_INPUT = 0.1f;
const float ScoringParams::MAX_NORM_DISTANCE_FOR_EDIT = 0.1f;
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.cpp b/native/jni/src/suggest/policyimpl/typing/typing_traversal.cpp
index 66f8ba9fa..e7e40e34d 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.cpp
+++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.cpp
@@ -18,7 +18,7 @@
namespace latinime {
const bool TypingTraversal::CORRECT_OMISSION = true;
-const bool TypingTraversal::CORRECT_SPACE_SUBSTITUTION = true;
-const bool TypingTraversal::CORRECT_SPACE_OMISSION = true;
+const bool TypingTraversal::CORRECT_NEW_WORD_SPACE_SUBSTITUTION = true;
+const bool TypingTraversal::CORRECT_NEW_WORD_SPACE_OMISSION = true;
const TypingTraversal TypingTraversal::sInstance;
} // namespace latinime
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
index f22029a2c..9f8347452 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_traversal.h
@@ -66,7 +66,7 @@ class TypingTraversal : public Traversal {
AK_FORCE_INLINE bool isSpaceSubstitutionTerminal(
const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const {
- if (!CORRECT_SPACE_SUBSTITUTION) {
+ if (!CORRECT_NEW_WORD_SPACE_SUBSTITUTION) {
return false;
}
if (!canDoLookAheadCorrection(traverseSession, dicNode)) {
@@ -80,7 +80,7 @@ class TypingTraversal : public Traversal {
AK_FORCE_INLINE bool isSpaceOmissionTerminal(
const DicTraverseSession *const traverseSession, const DicNode *const dicNode) const {
- if (!CORRECT_SPACE_OMISSION) {
+ if (!CORRECT_NEW_WORD_SPACE_OMISSION) {
return false;
}
const int inputSize = traverseSession->getInputSize();
@@ -173,8 +173,8 @@ class TypingTraversal : public Traversal {
private:
DISALLOW_COPY_AND_ASSIGN(TypingTraversal);
static const bool CORRECT_OMISSION;
- static const bool CORRECT_SPACE_SUBSTITUTION;
- static const bool CORRECT_SPACE_OMISSION;
+ static const bool CORRECT_NEW_WORD_SPACE_SUBSTITUTION;
+ static const bool CORRECT_NEW_WORD_SPACE_OMISSION;
static const TypingTraversal sInstance;
TypingTraversal() {}
diff --git a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
index 2dcee343f..34d25ae1a 100644
--- a/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
+++ b/native/jni/src/suggest/policyimpl/typing/typing_weighting.h
@@ -128,17 +128,19 @@ class TypingWeighting : public Weighting {
return cost + weightedDistance;
}
- float getNewWordCost(const DicNode *const dicNode) const {
+ float getNewWordCost(const DicTraverseSession *const traverseSession,
+ const DicNode *const dicNode) const {
const bool isCapitalized = dicNode->isCapitalized();
- return isCapitalized ?
+ const float cost = isCapitalized ?
ScoringParams::COST_NEW_WORD_CAPITALIZED : ScoringParams::COST_NEW_WORD;
+ return cost * traverseSession->getMultiWordCostMultiplier();
}
float getNewWordBigramCost(
const DicTraverseSession *const traverseSession, const DicNode *const dicNode,
hash_map_compat<int, int16_t> *const bigramCacheMap) const {
return DicNodeUtils::getBigramNodeImprobability(traverseSession->getOffsetDict(),
- dicNode, bigramCacheMap);
+ dicNode, bigramCacheMap) * ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
}
float getCompletionCost(const DicTraverseSession *const traverseSession,
@@ -162,13 +164,8 @@ class TypingWeighting : public Weighting {
// because the input word shouldn't be treated as perfect
const bool isExactMatch = !hasEditCount && !hasMultipleWords
&& !hasProximityErrors && isSameLength;
-
- const float totalPrevWordsLanguageCost = dicNode->getTotalPrevWordsLanguageCost();
const float languageImprobability = isExactMatch ? 0.0f : dicNodeLanguageImprobability;
- const float languageWeight = ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
- // TODO: Caveat: The following equation should be:
- // totalPrevWordsLanguageCost + (languageImprobability * languageWeight);
- return (totalPrevWordsLanguageCost + languageImprobability) * languageWeight;
+ return languageImprobability * ScoringParams::DISTANCE_WEIGHT_LANGUAGE;
}
AK_FORCE_INLINE bool needsToNormalizeCompoundDistance() const {
@@ -183,8 +180,13 @@ class TypingWeighting : public Weighting {
return ScoringParams::SUBSTITUTION_COST;
}
- AK_FORCE_INLINE float getSpaceSubstitutionCost() const {
- return ScoringParams::SPACE_SUBSTITUTION_COST;
+ AK_FORCE_INLINE float getSpaceSubstitutionCost(
+ const DicTraverseSession *const traverseSession,
+ const DicNode *const dicNode) const {
+ const bool isCapitalized = dicNode->isCapitalized();
+ const float cost = ScoringParams::SPACE_SUBSTITUTION_COST + (isCapitalized ?
+ ScoringParams::COST_NEW_WORD_CAPITALIZED : ScoringParams::COST_NEW_WORD);
+ return cost * traverseSession->getMultiWordCostMultiplier();
}
private: