about summary refs log tree commit diff stats
path: root/java/src/com/android/inputmethod/research/MainLogBuffer.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/research/MainLogBuffer.java')
-rw-r--r-- java/src/com/android/inputmethod/research/MainLogBuffer.java 219
1 files changed, 158 insertions, 61 deletions
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java
index 745768d35..45b83dd76 100644
--- a/java/src/com/android/inputmethod/research/MainLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java
@@ -1,75 +1,104 @@
/*
* Copyright (C) 2012 The Android Open Source Project
*
- * Licensed under the Apache License, Version 2.0 (the "License"); you may not
- * use this file except in compliance with the License. You may obtain a copy of
- * the License at
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
- * License for the specific language governing permissions and limitations under
- * the License.
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*/
package com.android.inputmethod.research;
+import android.util.Log;
+
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
+import com.android.inputmethod.latin.define.ProductionFlag;
+import java.util.ArrayList;
+import java.util.LinkedList;
import java.util.Random;
-public class MainLogBuffer extends LogBuffer {
+/**
+ * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
+ *
+ * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
+ * be logged in enough detail to determine their contents, 2) only a subset of words are logged
+ * in detail, such as 10%, and 3) no numbers are logged.
+ *
+ * This class maintains a list of LogUnits, each corresponding to a word. As the user completes
+ * words, they are added here. But if the user backs up over their current word to edit a word
+ * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
+ * the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled
+ * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
+ * quickly. However, we cannot let the contents pile up either, or it will limit the editing that
+ * a user can perform.
+ *
+ * To balance these requirements (keep history so user can edit, flush history so it does not pile
+ * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
+ * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
+ * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
+ * However, the additional non-detailed words are retained, in case the user backspaces to edit
+ * them. The MainLogBuffer then continues to add words, publishing individual non-detailed words
+ * as new words arrive. After enough non-detailed words have been pushed out to account for the
+ * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
+ *
+ * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
+ * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
+ * dictionary words.
+ *
+ * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
+ * n-gram containing dictionary words.
+ */
+public abstract class MainLogBuffer extends FixedLogBuffer {
+ private static final String TAG = MainLogBuffer.class.getSimpleName();
+ private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
+
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
- private static final int N_GRAM_SIZE = 2;
- // The number of words between n-grams to omit from the log.
- private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = 18;
+ public static final int N_GRAM_SIZE = 2;
- private final ResearchLog mResearchLog;
private Suggest mSuggest;
+ private boolean mIsStopping = false;
- // The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if
- // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
- // for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a
- // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
- // n-gram does appear.
- /* package for test */ int mMinWordPeriod;
+ /* package for test */ int mNumWordsBetweenNGrams;
// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
// after a sample is taken.
- /* package for test */ int mWordsUntilSafeToSample;
-
- public MainLogBuffer(final ResearchLog researchLog) {
- super(N_GRAM_SIZE);
- mResearchLog = researchLog;
- mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
- final Random random = new Random();
- mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
+ /* package for test */ int mNumWordsUntilSafeToSample;
+
+ public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore) {
+ super(N_GRAM_SIZE + wordsBetweenSamples);
+ mNumWordsBetweenNGrams = wordsBetweenSamples;
+ mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
}
- public void setSuggest(Suggest suggest) {
+ public void setSuggest(final Suggest suggest) {
mSuggest = suggest;
}
- @Override
- public void shiftIn(final LogUnit newLogUnit) {
- super.shiftIn(newLogUnit);
- if (newLogUnit.hasWord()) {
- if (mWordsUntilSafeToSample > 0) {
- mWordsUntilSafeToSample--;
- }
- }
+ private Dictionary getDictionary() {
+ if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
+ return mSuggest.getMainDictionary();
}
public void resetWordCounter() {
- mWordsUntilSafeToSample = mMinWordPeriod;
+ mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
+ }
+
+ public void setIsStopping() {
+ mIsStopping = true;
}
/**
- * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
- * form and still protect the user's privacy.
+ * Determines whether uploading the n words at the front of the MainLogBuffer will not violate
+ * user privacy.
*
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
* non-character data that is typed between words. The decision about privacy is made based on
@@ -78,50 +107,118 @@ public class MainLogBuffer extends LogBuffer {
* the screen orientation and other characteristics about the device can be uploaded without
* revealing much about the user.
*/
- public boolean isSafeToLog() {
+ private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
+ // Bypass privacy checks when debugging.
+ if (ResearchLogger.IS_LOGGING_EVERYTHING) {
+ if (mIsStopping) {
+ return true;
+ }
+ // Only check that it is the right length. If not, wait for later words to make
+ // complete n-grams.
+ int numWordsInLogUnitList = 0;
+ final int length = logUnits.size();
+ for (int i = 0; i < length; i++) {
+ final LogUnit logUnit = logUnits.get(i);
+ final String word = logUnit.getWord();
+ if (word != null) {
+ numWordsInLogUnitList++;
+ }
+ }
+ return numWordsInLogUnitList >= minNGramSize;
+ }
+
// Check that we are not sampling too frequently. Having sampled recently might disclose
// too much of the user's intended meaning.
- if (mWordsUntilSafeToSample > 0) {
- return false;
- }
- if (mSuggest == null || !mSuggest.hasMainDictionary()) {
- // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word
- // is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to
- // potentially pose a privacy risk.
+ if (mNumWordsUntilSafeToSample > 0) {
return false;
}
// Reload the dictionary in case it has changed (e.g., because the user has changed
// languages).
- final Dictionary dictionary = mSuggest.getMainDictionary();
+ final Dictionary dictionary = getDictionary();
if (dictionary == null) {
+ // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
+ // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
+ // contents to potentially pose a privacy risk.
return false;
}
- // Check each word in the buffer. If any word poses a privacy threat, we cannot upload the
- // complete buffer contents in detail.
- final int length = mLogUnits.size();
+
+ // Check each word in the buffer. If any word poses a privacy threat, we cannot upload
+ // the complete buffer contents in detail.
+ int numWordsInLogUnitList = 0;
+ final int length = logUnits.size();
for (int i = 0; i < length; i++) {
- final LogUnit logUnit = mLogUnits.get(i);
- final String word = logUnit.getWord();
- if (word == null) {
+ final LogUnit logUnit = logUnits.get(i);
+ if (!logUnit.hasWord()) {
// Digits outside words are a privacy threat.
- if (logUnit.hasDigit()) {
+ if (logUnit.mayContainDigit()) {
return false;
}
} else {
+ numWordsInLogUnitList++;
+ final String word = logUnit.getWord();
// Words not in the dictionary are a privacy threat.
- if (!(dictionary.isValidWord(word))) {
+ if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
+ if (DEBUG) {
+ Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
+ + ", isValid: " + (dictionary.isValidWord(word)));
+ }
return false;
}
}
}
- // All checks have passed; this buffer's content can be safely uploaded.
- return true;
+
+ // Finally, only return true if the minNGramSize is met.
+ return numWordsInLogUnitList >= minNGramSize;
+ }
+
+ public void shiftAndPublishAll() {
+ final LinkedList<LogUnit> logUnits = getLogUnits();
+ while (!logUnits.isEmpty()) {
+ publishLogUnitsAtFrontOfBuffer();
+ }
+ }
+
+ @Override
+ protected final void onBufferFull() {
+ publishLogUnitsAtFrontOfBuffer();
+ }
+
+ protected final void publishLogUnitsAtFrontOfBuffer() {
+ ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
+ if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
+ // Good n-gram at the front of the buffer. Publish it, disclosing details.
+ publish(logUnits, true /* canIncludePrivateData */);
+ shiftOutWords(N_GRAM_SIZE);
+ resetWordCounter();
+ } else {
+ // No good n-gram at front, and buffer is full or being flushed. Shift out the first
+ // word (or if there is none, the existing logUnits).
+ logUnits = peekAtFirstNWords(1);
+ publish(logUnits, false /* canIncludePrivateData */);
+ shiftOutWords(1);
+ }
}
+ /**
+ * Called when a list of logUnits should be published.
+ *
+ * It is the subclass's responsibility to implement the publication.
+ *
+ * @param logUnits The list of logUnits to be published.
+ * @param canIncludePrivateData Whether the private data in the logUnits can be included in
+ * publication.
+ */
+ protected abstract void publish(final ArrayList<LogUnit> logUnits,
+ final boolean canIncludePrivateData);
+
@Override
- protected void onShiftOut(LogUnit logUnit) {
- if (mResearchLog != null) {
- mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
+ protected void shiftOutWords(final int numWords) {
+ final int oldNumActualWords = getNumActualWords();
+ super.shiftOutWords(numWords);
+ final int numWordsShifted = oldNumActualWords - getNumActualWords();
+ mNumWordsUntilSafeToSample -= numWordsShifted;
+ if (DEBUG) {
+ Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
}
}
}