diff options
Diffstat (limited to 'java/src/com/android/inputmethod/research/MainLogBuffer.java')
-rw-r--r-- | java/src/com/android/inputmethod/research/MainLogBuffer.java | 219 |
1 files changed, 158 insertions, 61 deletions
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index 745768d35..45b83dd76 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -1,75 +1,104 @@ /* * Copyright (C) 2012 The Android Open Source Project * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ package com.android.inputmethod.research; +import android.util.Log; + import com.android.inputmethod.latin.Dictionary; import com.android.inputmethod.latin.Suggest; +import com.android.inputmethod.latin.define.ProductionFlag; +import java.util.ArrayList; +import java.util.LinkedList; import java.util.Random; -public class MainLogBuffer extends LogBuffer { +/** + * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees. 
+ * + * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to + * be logged in enough detail to determine their contents, 2) only a subset of words are logged + * in detail, such as 10%, and 3) no numbers are logged. + * + * This class maintains a list of LogUnits, each corresponding to a word. As the user completes + * words, they are added here. But if the user backs up over their current word to edit a word + * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of + * the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled + * back out even after they are pushed in, we must not publish the contents of this LogBuffer too + * quickly. However, we cannot let the contents pile up either, or it will limit the editing that + * a user can perform. + * + * To balance these requirements (keep history so user can edit, flush history so it does not pile + * up), the LogBuffer is considered "complete" when the user has entered enough words to form an + * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above). + * Once complete, the n-gram may be published to flash storage (via the ResearchLog class). + * However, the additional non-detailed words are retained, in case the user backspaces to edit + * them. The MainLogBuffer then continues to add words, publishing individual non-detailed words + * as new words arrive. After enough non-detailed words have been pushed out to account for the + * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again. + * + * If the words that would form the valid n-gram are not in the dictionary, then words are pushed + * through the LogBuffer one at a time until an n-gram is found that is entirely composed of + * dictionary words. 
+ * + * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded + * n-gram containing dictionary words. + */ +public abstract class MainLogBuffer extends FixedLogBuffer { + private static final String TAG = MainLogBuffer.class.getSimpleName(); + private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG; + // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. - private static final int N_GRAM_SIZE = 2; - // The number of words between n-grams to omit from the log. - private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = 18; + public static final int N_GRAM_SIZE = 2; - private final ResearchLog mResearchLog; private Suggest mSuggest; + private boolean mIsStopping = false; - // The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if - // every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc. - // for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a - // number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe - // n-gram does appear. - /* package for test */ int mMinWordPeriod; + /* package for test */ int mNumWordsBetweenNGrams; // Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod // after a sample is taken. 
- /* package for test */ int mWordsUntilSafeToSample; - - public MainLogBuffer(final ResearchLog researchLog) { - super(N_GRAM_SIZE); - mResearchLog = researchLog; - mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE; - final Random random = new Random(); - mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod); + /* package for test */ int mNumWordsUntilSafeToSample; + + public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore) { + super(N_GRAM_SIZE + wordsBetweenSamples); + mNumWordsBetweenNGrams = wordsBetweenSamples; + mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore; } - public void setSuggest(Suggest suggest) { + public void setSuggest(final Suggest suggest) { mSuggest = suggest; } - @Override - public void shiftIn(final LogUnit newLogUnit) { - super.shiftIn(newLogUnit); - if (newLogUnit.hasWord()) { - if (mWordsUntilSafeToSample > 0) { - mWordsUntilSafeToSample--; - } - } + private Dictionary getDictionary() { + if (mSuggest == null || !mSuggest.hasMainDictionary()) return null; + return mSuggest.getMainDictionary(); } public void resetWordCounter() { - mWordsUntilSafeToSample = mMinWordPeriod; + mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams; + } + + public void setIsStopping() { + mIsStopping = true; } /** - * Determines whether the content of the MainLogBuffer can be safely uploaded in its complete - * form and still protect the user's privacy. + * Determines whether uploading the n words at the front of the MainLogBuffer will not violate + * user privacy. * * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any * non-character data that is typed between words. The decision about privacy is made based on @@ -78,50 +107,118 @@ public class MainLogBuffer extends LogBuffer { * the screen orientation and other characteristics about the device can be uploaded without * revealing much about the user. 
*/ - public boolean isSafeToLog() { + private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) { + // Bypass privacy checks when debugging. + if (ResearchLogger.IS_LOGGING_EVERYTHING) { + if (mIsStopping) { + return true; + } + // Only check that it is the right length. If not, wait for later words to make + // complete n-grams. + int numWordsInLogUnitList = 0; + final int length = logUnits.size(); + for (int i = 0; i < length; i++) { + final LogUnit logUnit = logUnits.get(i); + final String word = logUnit.getWord(); + if (word != null) { + numWordsInLogUnitList++; + } + } + return numWordsInLogUnitList >= minNGramSize; + } + // Check that we are not sampling too frequently. Having sampled recently might disclose // too much of the user's intended meaning. - if (mWordsUntilSafeToSample > 0) { - return false; - } - if (mSuggest == null || !mSuggest.hasMainDictionary()) { - // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word - // is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to - // potentially pose a privacy risk. + if (mNumWordsUntilSafeToSample > 0) { return false; } // Reload the dictionary in case it has changed (e.g., because the user has changed // languages). - final Dictionary dictionary = mSuggest.getMainDictionary(); + final Dictionary dictionary = getDictionary(); if (dictionary == null) { + // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a + // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer + // contents to potentially pose a privacy risk. return false; } - // Check each word in the buffer. If any word poses a privacy threat, we cannot upload the - // complete buffer contents in detail. - final int length = mLogUnits.size(); + + // Check each word in the buffer. If any word poses a privacy threat, we cannot upload + // the complete buffer contents in detail. 
+ int numWordsInLogUnitList = 0; + final int length = logUnits.size(); for (int i = 0; i < length; i++) { - final LogUnit logUnit = mLogUnits.get(i); - final String word = logUnit.getWord(); - if (word == null) { + final LogUnit logUnit = logUnits.get(i); + if (!logUnit.hasWord()) { // Digits outside words are a privacy threat. - if (logUnit.hasDigit()) { + if (logUnit.mayContainDigit()) { return false; } } else { + numWordsInLogUnitList++; + final String word = logUnit.getWord(); // Words not in the dictionary are a privacy threat. - if (!(dictionary.isValidWord(word))) { + if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { + if (DEBUG) { + Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word) + + ", isValid: " + (dictionary.isValidWord(word))); + } return false; } } } - // All checks have passed; this buffer's content can be safely uploaded. - return true; + + // Finally, only return true if the minNGramSize is met. + return numWordsInLogUnitList >= minNGramSize; + } + + public void shiftAndPublishAll() { + final LinkedList<LogUnit> logUnits = getLogUnits(); + while (!logUnits.isEmpty()) { + publishLogUnitsAtFrontOfBuffer(); + } + } + + @Override + protected final void onBufferFull() { + publishLogUnitsAtFrontOfBuffer(); + } + + protected final void publishLogUnitsAtFrontOfBuffer() { + ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE); + if (isSafeNGram(logUnits, N_GRAM_SIZE)) { + // Good n-gram at the front of the buffer. Publish it, disclosing details. + publish(logUnits, true /* canIncludePrivateData */); + shiftOutWords(N_GRAM_SIZE); + resetWordCounter(); + } else { + // No good n-gram at front, and buffer is full. Shift out the first word (or if there + // is none, the existing logUnits). + logUnits = peekAtFirstNWords(1); + publish(logUnits, false /* canIncludePrivateData */); + shiftOutWords(1); + } } + /** + * Called when a list of logUnits should be published. 
+ * + * It is the subclass's responsibility to implement the publication. + * + * @param logUnits The list of logUnits to be published. + * @param canIncludePrivateData Whether the private data in the logUnits can be included in + * publication. + */ + protected abstract void publish(final ArrayList<LogUnit> logUnits, + final boolean canIncludePrivateData); + @Override - protected void onShiftOut(LogUnit logUnit) { - if (mResearchLog != null) { - mResearchLog.publish(logUnit, false /* isIncludingPrivateData */); + protected void shiftOutWords(final int numWords) { + final int oldNumActualWords = getNumActualWords(); + super.shiftOutWords(numWords); + final int numWordsShifted = oldNumActualWords - getNumActualWords(); + mNumWordsUntilSafeToSample -= numWordsShifted; + if (DEBUG) { + Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample); } } } |