diff options
Diffstat (limited to 'java/src/com/android/inputmethod/research/MainLogBuffer.java')
-rw-r--r-- | java/src/com/android/inputmethod/research/MainLogBuffer.java | 72 |
1 files changed, 55 insertions, 17 deletions
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index 9aa349906..3482153b4 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -63,6 +63,15 @@ public abstract class MainLogBuffer extends FixedLogBuffer { private static final boolean DEBUG = false && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG; + // Keep consistent with switch statement in Statistics.recordPublishabilityResultCode() + public static final int PUBLISHABILITY_PUBLISHABLE = 0; + public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1; + public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2; + public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3; + public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4; + public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5; + public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6; + // The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams. public static final int N_GRAM_SIZE = 2; @@ -105,21 +114,24 @@ public abstract class MainLogBuffer extends FixedLogBuffer { } /** - * Determines whether uploading the n words at the front the MainLogBuffer will not violate - * user privacy. + * Determines whether the string determined by a series of LogUnits will not violate user + * privacy if published. + * + * @param logUnits a LogUnit list to check for publishability + * @param nGramSize the smallest n-gram acceptable to be published. if + * {@link ResearchLogger.IS_LOGGING_EVERYTHING} is true, then publish if there are more than + * {@code minNGramSize} words in the logUnits, otherwise wait. if {@link + * ResearchLogger.IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize + * words in the LogUnits. * - * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any - * non-character data that is typed between words. The decision about privacy is made based on - * the buffer's entire content. If it is decided that the privacy risks are too great to upload - * the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g., - * the screen orientation and other characteristics about the device can be uploaded without - * revealing much about the user. + * @return one of the {@code PUBLISHABILITY_*} result codes defined in this class. */ - private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) { + private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits, + final int nGramSize) { // Bypass privacy checks when debugging. if (ResearchLogger.IS_LOGGING_EVERYTHING) { if (mIsStopping) { - return true; + return PUBLISHABILITY_UNPUBLISHABLE_STOPPING; } // Only check that it is the right length. If not, wait for later words to make // complete n-grams. @@ -129,13 +141,17 @@ public abstract class MainLogBuffer extends FixedLogBuffer { final LogUnit logUnit = logUnits.get(i); numWordsInLogUnitList += logUnit.getNumWords(); } - return numWordsInLogUnitList >= minNGramSize; + if (numWordsInLogUnitList >= nGramSize) { + return PUBLISHABILITY_PUBLISHABLE; + } else { + return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT; + } } // Check that we are not sampling too frequently. Having sampled recently might disclose // too much of the user's intended meaning. if (mNumWordsUntilSafeToSample > 0) { - return false; + return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY; } // Reload the dictionary in case it has changed (e.g., because the user has changed // languages). @@ -144,7 +160,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer { // Main dictionary is unavailable. Since we cannot check it, we cannot tell if a // word is out-of-vocabulary or not. Therefore, we must judge the entire buffer // contents to potentially pose a privacy risk. - return false; + return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE; } // Check each word in the buffer. If any word poses a privacy threat, we cannot upload @@ -155,7 +171,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer { if (!logUnit.hasOneOrMoreWords()) { // Digits outside words are a privacy threat. if (logUnit.mayContainDigit()) { - return false; + return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT; } } else { numWordsInLogUnitList += logUnit.getNumWords(); @@ -168,14 +184,18 @@ public abstract class MainLogBuffer extends FixedLogBuffer { + ResearchLogger.hasLetters(word) + ", isValid: " + (dictionary.isValidWord(word))); } - return false; + return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY; } } } } // Finally, only return true if the ngram is the right size. - return numWordsInLogUnitList == minNGramSize; + if (numWordsInLogUnitList == nGramSize) { + return PUBLISHABILITY_PUBLISHABLE; + } else { + return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT; + } } public void shiftAndPublishAll() throws IOException { @@ -196,11 +216,29 @@ public abstract class MainLogBuffer extends FixedLogBuffer { } } + /** + * If there is a safe n-gram at the front of this log buffer, publish it with all details, and + * remove the LogUnits that constitute it. + * + * An n-gram might not be "safe" if it violates privacy controls. E.g., it might contain + * numbers, an out-of-vocabulary word, or another n-gram may have been published recently. If + * there is no safe n-gram, then the LogUnits up through the first word-containing LogUnit are + * published, but without disclosing any privacy-related details, such as the word the LogUnit + * generated, motion data, etc. + * + * Note that a LogUnit can hold more than one word if the user types without explicit spaces. + * In this case, the words may be grouped together in such a way that pulling an n-gram off the + * front would require splitting a LogUnit. Splitting a LogUnit is not possible, so this case + * is treated just as the unsafe n-gram case. This may cause n-grams to be sampled at slightly + * less than the target frequency. + */ protected final void publishLogUnitsAtFrontOfBuffer() throws IOException { // TODO: Refactor this method to require fewer passes through the LogUnits. Should really // require only one pass. ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE); - if (isSafeNGram(logUnits, N_GRAM_SIZE)) { + final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE); + ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode); + if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) { // Good n-gram at the front of the buffer. Publish it, disclosing details. publish(logUnits, true /* canIncludePrivateData */); shiftOutWords(N_GRAM_SIZE); |