diff options
author | 2013-01-21 19:39:37 +0000 | |
---|---|---|
committer | 2013-01-21 19:39:38 +0000 | |
commit | 8b788374dee56dfe95e7af42a358923cfcb3668e (patch) | |
tree | 0a13064948752aaa94f85a67cc9e08dd02abc817 /java/src/com/android/inputmethod/research/MainLogBuffer.java | |
parent | 08d9c996dc31e767aca5c31df7ede80a36361060 (diff) | |
parent | 80685aa4b95173638c7982dbac723b282292a931 (diff) | |
download | latinime-8b788374dee56dfe95e7af42a358923cfcb3668e.tar.gz latinime-8b788374dee56dfe95e7af42a358923cfcb3668e.tar.xz latinime-8b788374dee56dfe95e7af42a358923cfcb3668e.zip |
Merge "[Rlog78b] Make log privacy filtering decisions on n-grams"
Diffstat (limited to 'java/src/com/android/inputmethod/research/MainLogBuffer.java')
-rw-r--r-- | java/src/com/android/inputmethod/research/MainLogBuffer.java | 129 |
1 files changed, 90 insertions, 39 deletions
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index a32336d57..57d5c41d7 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -22,6 +22,7 @@ import com.android.inputmethod.latin.Dictionary; import com.android.inputmethod.latin.Suggest; import com.android.inputmethod.latin.define.ProductionFlag; +import java.util.ArrayList; import java.util.LinkedList; import java.util.Random; @@ -56,19 +57,24 @@ import java.util.Random; * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded * n-gram containing dictionary words. */ -public class MainLogBuffer extends FixedLogBuffer { +public abstract class MainLogBuffer extends FixedLogBuffer { private static final String TAG = MainLogBuffer.class.getSimpleName(); private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG; // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. public static final int N_GRAM_SIZE = 2; - // The number of words between n-grams to omit from the log. If debugging, record 50% of all - // words. Otherwise, only record 10%. + + // Whether all words should be recorded, leaving unsampled word between bigrams. Useful for + // testing. + /* package for test */ static final boolean IS_LOGGING_EVERYTHING = false + && ProductionFlag.IS_EXPERIMENTAL_DEBUG; + + // The number of words between n-grams to omit from the log. private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES = - ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18; + IS_LOGGING_EVERYTHING ? 0 : (DEBUG ? 2 : 18); - private final ResearchLog mResearchLog; private Suggest mSuggest; + private boolean mIsStopping = false; /* package for test */ int mNumWordsBetweenNGrams; @@ -76,9 +82,8 @@ public class MainLogBuffer extends FixedLogBuffer { // after a sample is taken. /* package for test */ int mNumWordsUntilSafeToSample; - public MainLogBuffer(final ResearchLog researchLog) { + public MainLogBuffer() { super(N_GRAM_SIZE + DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES); - mResearchLog = researchLog; mNumWordsBetweenNGrams = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES; final Random random = new Random(); mNumWordsUntilSafeToSample = DEBUG ? 0 : random.nextInt(mNumWordsBetweenNGrams + 1); @@ -92,6 +97,10 @@ public class MainLogBuffer extends FixedLogBuffer { mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams; } + public void setIsStopping() { + mIsStopping = true; + } + /** * Determines whether uploading the n words at the front the MainLogBuffer will not violate * user privacy. @@ -103,16 +112,36 @@ public class MainLogBuffer extends FixedLogBuffer { * the screen orientation and other characteristics about the device can be uploaded without * revealing much about the user. */ - public boolean isNGramSafe() { + private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) { + // Bypass privacy checks when debugging. + if (IS_LOGGING_EVERYTHING) { + if (mIsStopping) { + return true; + } else { + // Only check that it is the right length. If not, wait for later words to make + // complete n-grams. + int numWordsInLogUnitList = 0; + final int length = logUnits.size(); + for (int i = 0; i < length; i++) { + final LogUnit logUnit = logUnits.get(i); + final String word = logUnit.getWord(); + if (word != null) { + numWordsInLogUnitList++; + } + } + return numWordsInLogUnitList >= minNGramSize; + } + } + // Check that we are not sampling too frequently. Having sampled recently might disclose // too much of the user's intended meaning. if (mNumWordsUntilSafeToSample > 0) { return false; } if (mSuggest == null || !mSuggest.hasMainDictionary()) { - // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word - // is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to - // potentially pose a privacy risk. + // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a + // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer + // contents to potentially pose a privacy risk. return false; } // Reload the dictionary in case it has changed (e.g., because the user has changed @@ -121,12 +150,12 @@ public class MainLogBuffer extends FixedLogBuffer { if (dictionary == null) { return false; } - // Check each word in the buffer. If any word poses a privacy threat, we cannot upload the - // complete buffer contents in detail. - final LinkedList<LogUnit> logUnits = getLogUnits(); + + // Check each word in the buffer. If any word poses a privacy threat, we cannot upload + // the complete buffer contents in detail. + int numWordsInLogUnitList = 0; final int length = logUnits.size(); - int wordsNeeded = N_GRAM_SIZE; - for (int i = 0; i < length && wordsNeeded > 0; i++) { + for (int i = 0; i < length; i++) { final LogUnit logUnit = logUnits.get(i); final String word = logUnit.getWord(); if (word == null) { @@ -135,6 +164,7 @@ public class MainLogBuffer extends FixedLogBuffer { return false; } } else { + numWordsInLogUnitList++; // Words not in the dictionary are a privacy threat. if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { if (DEBUG) { @@ -145,38 +175,59 @@ public class MainLogBuffer extends FixedLogBuffer { } } } - // All checks have passed; this buffer's content can be safely uploaded. - return true; + + // Finally, only return true if the minNGramSize is met. + return numWordsInLogUnitList >= minNGramSize; } - public boolean isNGramComplete() { + public void shiftAndPublishAll() { final LinkedList<LogUnit> logUnits = getLogUnits(); - final int length = logUnits.size(); - int wordsNeeded = N_GRAM_SIZE; - for (int i = 0; i < length && wordsNeeded > 0; i++) { - final LogUnit logUnit = logUnits.get(i); - final String word = logUnit.getWord(); - if (word != null) { - wordsNeeded--; - } + while (!logUnits.isEmpty()) { + publishLogUnitsAtFrontOfBuffer(); } - return wordsNeeded == 0; } @Override - protected void onShiftOut(final LogUnit logUnit) { - if (mResearchLog != null) { - mResearchLog.publish(logUnit, - ResearchLogger.IS_LOGGING_EVERYTHING /* isIncludingPrivateData */); - } - if (logUnit.hasWord()) { - if (mNumWordsUntilSafeToSample > 0) { - mNumWordsUntilSafeToSample--; - Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample); - } + protected final void onBufferFull() { + publishLogUnitsAtFrontOfBuffer(); + } + + protected final void publishLogUnitsAtFrontOfBuffer() { + ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE); + if (isSafeNGram(logUnits, N_GRAM_SIZE)) { + // Good n-gram at the front of the buffer. Publish it, disclosing details. + publish(logUnits, true /* canIncludePrivateData */); + shiftOutWords(N_GRAM_SIZE); + resetWordCounter(); + } else { + // No good n-gram at front, and buffer is full. Shift out the first word (or if there + // is none, the existing logUnits). + logUnits = peekAtFirstNWords(1); + publish(logUnits, false /* canIncludePrivateData */); + shiftOutWords(1); } + } + + /** + * Called when a list of logUnits should be published. + * + * It is the subclass's responsibility to implement the publication. + * + * @param logUnits The list of logUnits to be published. + * @param canIncludePrivateData Whether the private data in the logUnits can be included in + * publication. + */ + protected abstract void publish(final ArrayList<LogUnit> logUnits, + final boolean canIncludePrivateData); + + @Override + protected void shiftOutWords(int numWords) { + int oldNumActualWords = getNumActualWords(); + super.shiftOutWords(numWords); + int numWordsShifted = oldNumActualWords - getNumActualWords(); + mNumWordsUntilSafeToSample -= numWordsShifted; if (DEBUG) { - Log.d(TAG, "shiftedOut " + (logUnit.hasWord() ? logUnit.getWord() : "")); + Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample); } } } |