Move getPrevWordsInfoFromNthPreviousWord to StringUtils.

Bug: 14425059 Change-Id: I3eb24e840c165e43f68c2a60fccf9974affb57a6
author: Keisuke Kuroyanagi <ksk@google.com> 2014-06-27 18:08:17 +0900
committer: Keisuke Kuroyanagi <ksk@google.com> 2014-06-27 18:08:17 +0900
commit: ba463c9a66f75e8d00f4658e32b763eb54215231 (patch)
tree: edfe7e9f6e7cc4d4242199cd215bc873c4a8cbe1 /java/src/com/android/inputmethod/latin/utils/StringUtils.java
parent: 1c2f1ada8305e36defa8572da687a4596bf083ea (diff)
download: latinime-ba463c9a66f75e8d00f4658e32b763eb54215231.tar.gz
latinime-ba463c9a66f75e8d00f4658e32b763eb54215231.tar.xz
latinime-ba463c9a66f75e8d00f4658e32b763eb54215231.zip
1 files changed, 77 insertions, 0 deletions
diff --git a/java/src/com/android/inputmethod/latin/utils/StringUtils.java b/java/src/com/android/inputmethod/latin/utils/StringUtils.java
index e4237a7f2..bf2571466 100644
--- a/java/src/com/android/inputmethod/latin/utils/StringUtils.java
+++ b/java/src/com/android/inputmethod/latin/utils/StringUtils.java
@@ -22,10 +22,14 @@ import android.text.TextUtils;
 
 import com.android.inputmethod.annotations.UsedForTesting;
 import com.android.inputmethod.latin.Constants;
+import com.android.inputmethod.latin.PrevWordsInfo;
+import com.android.inputmethod.latin.PrevWordsInfo.WordInfo;
+import com.android.inputmethod.latin.settings.SpacingAndPunctuations;
 
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 public final class StringUtils {
     public static final int CAPITALIZE_NONE = 0;  // No caps, or mixed case
@@ -567,4 +571,77 @@ public final class StringUtils {
             return sb + "]";
         }
     }
+
+    private static final Pattern SPACE_REGEX = Pattern.compile("\\s+");
+    // Get context information from nth word before the cursor. n = 1 retrieves the words
+    // immediately before the cursor, n = 2 retrieves the words before that, and so on. This splits
+    // on whitespace only.
+    // Also, it won't return words that end in a separator (if the nth word before the cursor
+    // ends in a separator, it returns information representing beginning-of-sentence).
+    // Example (when Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM is 2):
+    // (n = 1) "abc def|" -> abc, def
+    // (n = 1) "abc def |" -> abc, def
+    // (n = 1) "abc 'def|" -> empty, 'def
+    // (n = 1) "abc def. |" -> beginning-of-sentence
+    // (n = 1) "abc def . |" -> beginning-of-sentence
+    // (n = 2) "abc def|" -> beginning-of-sentence, abc
+    // (n = 2) "abc def |" -> beginning-of-sentence, abc
+    // (n = 2) "abc 'def|" -> empty. The context is different from "abc def", but we cannot
+    // represent this situation using PrevWordsInfo. See TODO in the method.
+    // TODO: The next example's result should be "abc, def". This has to be fixed before we
+    // retrieve the prior context of Beginning-of-Sentence.
+    // (n = 2) "abc def. |" -> beginning-of-sentence, abc
+    // (n = 2) "abc def . |" -> abc, def
+    // (n = 2) "abc|" -> beginning-of-sentence
+    // (n = 2) "abc |" -> beginning-of-sentence
+    // (n = 2) "abc. def|" -> beginning-of-sentence
+    public static PrevWordsInfo getPrevWordsInfoFromNthPreviousWord(final CharSequence prev,
+            final SpacingAndPunctuations spacingAndPunctuations, final int n) {
+        if (prev == null) return PrevWordsInfo.EMPTY_PREV_WORDS_INFO;
+        final String[] w = SPACE_REGEX.split(prev);
+        final WordInfo[] prevWordsInfo = new WordInfo[Constants.MAX_PREV_WORD_COUNT_FOR_N_GRAM];
+        for (int i = 0; i < prevWordsInfo.length; i++) {
+            final int focusedWordIndex = w.length - n - i;
+            // Look at the word after the focused word (w[focusedWordIndex + 1]), if in range.
+            if ((focusedWordIndex + 1) >= 0 && (focusedWordIndex + 1) < w.length) {
+                final String wordFollowingTheNthPrevWord = w[focusedWordIndex + 1];
+                if (!wordFollowingTheNthPrevWord.isEmpty()) {
+                    final char firstChar = wordFollowingTheNthPrevWord.charAt(0);
+                    if (spacingAndPunctuations.isWordConnector(firstChar)) {
+                        // The word following the focused word is starting with a word connector.
+                        // TODO: Return meaningful context for this case.
+                        prevWordsInfo[i] = WordInfo.EMPTY_WORD_INFO;
+                        break;
+                    }
+                }
+            }
+            // If we can't find (n + i) words, the context is beginning-of-sentence.
+            if (focusedWordIndex < 0) {
+                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
+                break;
+            }
+            final String focusedWord = w[focusedWordIndex];
+            // If the word is empty, the context is beginning-of-sentence.
+            final int length = focusedWord.length();
+            if (length <= 0) {
+                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
+                break;
+            }
+            // If ends in a sentence separator, the context is beginning-of-sentence.
+            final char lastChar = focusedWord.charAt(length - 1);
+            if (spacingAndPunctuations.isSentenceSeparator(lastChar)) {
+                prevWordsInfo[i] = WordInfo.BEGINNING_OF_SENTENCE;
+                break;
+            }
+            // If ends in a word separator or connector, the context is unclear.
+            // TODO: Return meaningful context for this case.
+            if (spacingAndPunctuations.isWordSeparator(lastChar)
+                    || spacingAndPunctuations.isWordConnector(lastChar)) {
+                prevWordsInfo[i] = WordInfo.EMPTY_WORD_INFO;
+                break;
+            }
+            prevWordsInfo[i] = new WordInfo(focusedWord);
+        }
+        return new PrevWordsInfo(prevWordsInfo);
+    }
 }
author	Keisuke Kuroyanagi <ksk@google.com>	2014-06-27 18:08:17 +0900
committer	Keisuke Kuroyanagi <ksk@google.com>	2014-06-27 18:08:17 +0900
commit	ba463c9a66f75e8d00f4658e32b763eb54215231 (patch)
tree	edfe7e9f6e7cc4d4242199cd215bc873c4a8cbe1 /java/src/com/android/inputmethod/latin/utils/StringUtils.java
parent	1c2f1ada8305e36defa8572da687a4596bf083ea (diff)
download	latinime-ba463c9a66f75e8d00f4658e32b763eb54215231.tar.gz latinime-ba463c9a66f75e8d00f4658e32b763eb54215231.tar.xz latinime-ba463c9a66f75e8d00f4658e32b763eb54215231.zip