aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java
diff options
context:
space:
mode:
authorKeisuke Kuroyanagi <ksk@google.com>2014-09-29 10:52:18 +0900
committerKeisuke Kuroyanagi <ksk@google.com>2014-09-29 10:52:18 +0900
commitbb0eca57054758ef17b032d2654c1fc5f6b32101 (patch)
tree25a22c28dad09d7e2bdbbb95098ca54de67aced7 /java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java
parentbbd6a26be025bc419e342e32d86629c4ebd68dd8 (diff)
downloadlatinime-bb0eca57054758ef17b032d2654c1fc5f6b32101.tar.gz
latinime-bb0eca57054758ef17b032d2654c1fc5f6b32101.tar.xz
latinime-bb0eca57054758ef17b032d2654c1fc5f6b32101.zip
Rename PrevWordsInfo to NgramContext.
Bug: 14425059 Change-Id: Id06a71681fa8b5e589e29fba10fe5c1cfed66984
Diffstat (limited to 'java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java')
-rw-r--r--java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java103
1 files changed, 103 insertions, 0 deletions
diff --git a/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java b/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java
new file mode 100644
index 000000000..34eeac2c2
--- /dev/null
+++ b/java/src/com/android/inputmethod/latin/utils/NgramContextUtils.java
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.inputmethod.latin.utils;
+
+import java.util.Arrays;
+import java.util.regex.Pattern;
+
+import com.android.inputmethod.latin.Constants;
+import com.android.inputmethod.latin.NgramContext;
+import com.android.inputmethod.latin.NgramContext.WordInfo;
+import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
+
+public final class NgramContextUtils {
+ private NgramContextUtils() {
+ // Intentional empty constructor for utility class.
+ }
+
+ private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");
+ // Get context information from nth word before the cursor. n = 1 retrieves the words
+ // immediately before the cursor, n = 2 retrieves the words before that, and so on. This splits
+ // on whitespace only.
+ // Also, it won't return words that end in a separator (if the nth word before the cursor
+ // ends in a separator, it returns information representing beginning-of-sentence).
+ // Example (when Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
+ // (n = 1) "abc def|" -> abc, def
+ // (n = 1) "abc def |" -> abc, def
+ // (n = 1) "abc 'def|" -> empty, 'def
+ // (n = 1) "abc def. |" -> beginning-of-sentence
+ // (n = 1) "abc def . |" -> beginning-of-sentence
+ // (n = 2) "abc def|" -> beginning-of-sentence, abc
+ // (n = 2) "abc def |" -> beginning-of-sentence, abc
+ // (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
+ // represent this situation using NgramContext. See TODO in the method.
+ // TODO: The next example's result should be "abc, def". This have to be fixed before we
+ // retrieve the prior context of Beginning-of-Sentence.
+ // (n = 2) "abc def. |" -> beginning-of-sentence, abc
+ // (n = 2) "abc def . |" -> abc, def
+ // (n = 2) "abc|" -> beginning-of-sentence
+ // (n = 2) "abc |" -> beginning-of-sentence
+ // (n = 2) "abc. def|" -> beginning-of-sentence
+ public static NgramContext getNgramContextFromNthPreviousWord(final CharSequence prev,
+ final SpacingAndPunctuations spacingAndPunctuations, final int n) {
+ if (prev == null) return NgramContext.EMPTY_PREV_WORDS_INFO;
+ final String[] w = SPACE_REGEX.split(prev);
+ final WordInfo[] prevWordsInfo = new WordInfo[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+ Arrays.fill(prevWordsInfo, WordInfo.EMPTY_WORD_INFO);
+ for (int i = 0; i < prevWordsInfo.length; i++) {
+ final int focusedWordIndex = w.length - n - i;
+ // Referring to the word after the focused word.
+ if ((focusedWordIndex + 1) >= 0 && (focusedWordIndex + 1) < w.length) {
+ final String wordFollowingTheNthPrevWord = w[focusedWordIndex + 1];
+ if (!wordFollowingTheNthPrevWord.isEmpty()) {
+ final char firstChar = wordFollowingTheNthPrevWord.charAt(0);
+ if (spacingAndPunctuations.isWordConnector(firstChar)) {
+ // The word following the focused word is starting with a word connector.
+ // TODO: Return meaningful context for this case.
+ break;
+ }
+ }
+ }
+ // If we can't find (n + i) words, the context is beginning-of-sentence.
+ if (focusedWordIndex < 0) {
+ prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
+ break;
+ }
+ final String focusedWord = w[focusedWordIndex];
+ // If the word is, the context is beginning-of-sentence.
+ final int length = focusedWord.length();
+ if (length <= 0) {
+ prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
+ break;
+ }
+ // If ends in a sentence separator, the context is beginning-of-sentence.
+ final char lastChar = focusedWord.charAt(length - 1);
+ if (spacingAndPunctuations.isSentenceSeparator(lastChar)) {
+ prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
+ break;
+ }
+ // If ends in a word separator or connector, the context is unclear.
+ // TODO: Return meaningful context for this case.
+ if (spacingAndPunctuations.isWordSeparator(lastChar)
+ || spacingAndPunctuations.isWordConnector(lastChar)) {
+ break;
+ }
+ prevWordsInfo[i] = new WordInfo(focusedWord);
+ }
+ return new NgramContext(prevWordsInfo);
+ }
+}