1 files changed, 158 insertions, 61 deletions
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java
index 745768d35..45b83dd76 100644
--- a/java/src/com/android/inputmethod/research/MainLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java
@@ -1,75 +1,104 @@
 /*
  * Copyright (C) 2012 The Android Open Source Project
  *
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
  */
 
 package com.android.inputmethod.research;
 
+import android.util.Log;
+
 import com.android.inputmethod.latin.Dictionary;
 import com.android.inputmethod.latin.Suggest;
+import com.android.inputmethod.latin.define.ProductionFlag;
 
+import java.util.ArrayList;
+import java.util.LinkedList;
 import java.util.Random;
 
-public class MainLogBuffer extends LogBuffer {
+/**
+ * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
+ *
+ * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
+ * be logged in enough detail to determine their contents, 2) only a subset of words are logged
+ * in detail, such as 10%, and 3) no numbers are logged.
+ *
+ * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
+ * words, they are added here.  But if the user backs up over their current word to edit a word
+ * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
+ * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
+ * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
+ * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
+ * a user can perform.
+ *
+ * To balance these requirements (keep history so user can edit, flush history so it does not pile
+ * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
+ * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
+ * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
+ * However, the additional non-detailed words are retained, in case the user backspaces to edit
+ * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
+ * as new words arrive.  After enough non-detailed words have been pushed out to account for the
+ * 90% between n-grams, the words at the front of the LogBuffer can be published as an n-gram again.
+ *
+ * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
+ * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
+ * dictionary words.
+ *
+ * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
+ * n-gram containing dictionary words.
+ */
+public abstract class MainLogBuffer extends FixedLogBuffer {
+    private static final String TAG = MainLogBuffer.class.getSimpleName();
+    private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
+
     // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
-    private static final int N_GRAM_SIZE = 2;
-    // The number of words between n-grams to omit from the log.
-    private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = 18;
+    public static final int N_GRAM_SIZE = 2;
 
-    private final ResearchLog mResearchLog;
     private Suggest mSuggest;
+    private boolean mIsStopping = false;
 
-    // The minimum periodicity with which n-grams can be sampled.  E.g. mWinWordPeriod is 10 if
-    // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
-    // for 11-18, and the bigram at words 19 and 20.  If an n-gram is not safe (e.g. it  contains a
-    // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
-    // n-gram does appear.
-    /* package for test */ int mMinWordPeriod;
+    /* package for test */ int mNumWordsBetweenNGrams;
 
     // Counter for words left to suppress before an n-gram can be sampled.  Reset to mMinWordPeriod
     // after a sample is taken.
-    /* package for test */ int mWordsUntilSafeToSample;
-
-    public MainLogBuffer(final ResearchLog researchLog) {
-        super(N_GRAM_SIZE);
-        mResearchLog = researchLog;
-        mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
-        final Random random = new Random();
-        mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
+    /* package for test */ int mNumWordsUntilSafeToSample;
+
+    public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore) {
+        super(N_GRAM_SIZE + wordsBetweenSamples);
+        mNumWordsBetweenNGrams = wordsBetweenSamples;
+        mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
     }
 
-    public void setSuggest(Suggest suggest) {
+    public void setSuggest(final Suggest suggest) {
         mSuggest = suggest;
     }
 
-    @Override
-    public void shiftIn(final LogUnit newLogUnit) {
-        super.shiftIn(newLogUnit);
-        if (newLogUnit.hasWord()) {
-            if (mWordsUntilSafeToSample > 0) {
-                mWordsUntilSafeToSample--;
-            }
-        }
+    private Dictionary getDictionary() {
+        if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
+        return mSuggest.getMainDictionary();
     }
 
     public void resetWordCounter() {
-        mWordsUntilSafeToSample = mMinWordPeriod;
+        mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
+    }
+
+    public void setIsStopping() {
+        mIsStopping = true;
     }
 
     /**
-     * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
-     * form and still protect the user's privacy.
+     * Determines whether uploading the n words at the front of the MainLogBuffer will not violate
+     * user privacy.
      *
      * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
      * non-character data that is typed between words.  The decision about privacy is made based on
@@ -78,50 +107,118 @@ public class MainLogBuffer extends LogBuffer {
      * the screen orientation and other characteristics about the device can be uploaded without
      * revealing much about the user.
      */
-    public boolean isSafeToLog() {
+    private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
+        // Bypass privacy checks when debugging.
+        if (ResearchLogger.IS_LOGGING_EVERYTHING) {
+            if (mIsStopping) {
+                return true;
+            }
+            // Only check that it is the right length.  If not, wait for later words to make
+            // complete n-grams.
+            int numWordsInLogUnitList = 0;
+            final int length = logUnits.size();
+            for (int i = 0; i < length; i++) {
+                final LogUnit logUnit = logUnits.get(i);
+                final String word = logUnit.getWord();
+                if (word != null) {
+                    numWordsInLogUnitList++;
+                }
+            }
+            return numWordsInLogUnitList >= minNGramSize;
+        }
+
         // Check that we are not sampling too frequently.  Having sampled recently might disclose
         // too much of the user's intended meaning.
-        if (mWordsUntilSafeToSample > 0) {
-            return false;
-        }
-        if (mSuggest == null || !mSuggest.hasMainDictionary()) {
-            // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a word
-            // is out-of-vocabulary or not.  Therefore, we must judge the entire buffer contents to
-            // potentially pose a privacy risk.
+        if (mNumWordsUntilSafeToSample > 0) {
             return false;
         }
         // Reload the dictionary in case it has changed (e.g., because the user has changed
         // languages).
-        final Dictionary dictionary = mSuggest.getMainDictionary();
+        final Dictionary dictionary = getDictionary();
         if (dictionary == null) {
+            // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a
+            // word is out-of-vocabulary or not.  Therefore, we must judge the entire buffer
+            // contents to potentially pose a privacy risk.
             return false;
         }
-        // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload the
-        // complete buffer contents in detail.
-        final int length = mLogUnits.size();
+
+        // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload
+        // the complete buffer contents in detail.
+        int numWordsInLogUnitList = 0;
+        final int length = logUnits.size();
         for (int i = 0; i < length; i++) {
-            final LogUnit logUnit = mLogUnits.get(i);
-            final String word = logUnit.getWord();
-            if (word == null) {
+            final LogUnit logUnit = logUnits.get(i);
+            if (!logUnit.hasWord()) {
                 // Digits outside words are a privacy threat.
-                if (logUnit.hasDigit()) {
+                if (logUnit.mayContainDigit()) {
                     return false;
                 }
             } else {
+                numWordsInLogUnitList++;
+                final String word = logUnit.getWord();
                 // Words not in the dictionary are a privacy threat.
-                if (!(dictionary.isValidWord(word))) {
+                if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
+                    if (DEBUG) {
+                        Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
+                                + ", isValid: " + (dictionary.isValidWord(word)));
+                    }
                     return false;
                 }
             }
         }
-        // All checks have passed; this buffer's content can be safely uploaded.
-        return true;
+
+        // Finally, only return true if the minNGramSize is met.
+        return numWordsInLogUnitList >= minNGramSize;
+    }
+
+    public void shiftAndPublishAll() {
+        final LinkedList<LogUnit> logUnits = getLogUnits();
+        while (!logUnits.isEmpty()) {
+            publishLogUnitsAtFrontOfBuffer();
+        }
+    }
+
+    @Override
+    protected final void onBufferFull() {
+        publishLogUnitsAtFrontOfBuffer();
+    }
+
+    protected final void publishLogUnitsAtFrontOfBuffer() {
+        ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
+        if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
+            // Good n-gram at the front of the buffer.  Publish it, disclosing details.
+            publish(logUnits, true /* canIncludePrivateData */);
+            shiftOutWords(N_GRAM_SIZE);
+            resetWordCounter();
+        } else {
+            // No safe n-gram at the front.  Publish the first word (or if there is none, the
+            // existing logUnits) without private data, then shift it out.
+            logUnits = peekAtFirstNWords(1);
+            publish(logUnits, false /* canIncludePrivateData */);
+            shiftOutWords(1);
+        }
     }
 
+    /**
+     * Called when a list of logUnits should be published.
+     *
+     * It is the subclass's responsibility to implement the publication.
+     *
+     * @param logUnits The list of logUnits to be published.
+     * @param canIncludePrivateData Whether the private data in the logUnits can be included in
+     * publication.
+     */
+    protected abstract void publish(final ArrayList<LogUnit> logUnits,
+            final boolean canIncludePrivateData);
+
     @Override
-    protected void onShiftOut(LogUnit logUnit) {
-        if (mResearchLog != null) {
-            mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
+    protected void shiftOutWords(final int numWords) {
+        final int oldNumActualWords = getNumActualWords();
+        super.shiftOutWords(numWords);
+        final int numWordsShifted = oldNumActualWords - getNumActualWords();
+        mNumWordsUntilSafeToSample -= numWordsShifted;
+        if (DEBUG) {
+            Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
         }
     }
 }