aboutsummaryrefslogtreecommitdiffstats
path: root/java/src/com/android/inputmethod/research/MainLogBuffer.java
diff options
context:
space:
mode:
Diffstat (limited to 'java/src/com/android/inputmethod/research/MainLogBuffer.java')
-rw-r--r--java/src/com/android/inputmethod/research/MainLogBuffer.java72
1 files changed, 55 insertions, 17 deletions
diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java
index 9aa349906..3482153b4 100644
--- a/java/src/com/android/inputmethod/research/MainLogBuffer.java
+++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java
@@ -63,6 +63,15 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
private static final boolean DEBUG = false
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
+ // Keep consistent with switch statement in Statistics.recordPublishabilityResultCode()
+ public static final int PUBLISHABILITY_PUBLISHABLE = 0;
+ public static final int PUBLISHABILITY_UNPUBLISHABLE_STOPPING = 1;
+ public static final int PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT = 2;
+ public static final int PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY = 3;
+ public static final int PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE = 4;
+ public static final int PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT = 5;
+ public static final int PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY = 6;
+
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
public static final int N_GRAM_SIZE = 2;
@@ -105,21 +114,24 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
}
/**
- * Determines whether uploading the n words at the front the MainLogBuffer will not violate
- * user privacy.
+ * Determines whether the string determined by a series of LogUnits will not violate user
+ * privacy if published.
+ *
+ * @param logUnits a LogUnit list to check for publishability
+ * @param nGramSize the smallest n-gram acceptable to be published. if
+ * {@link ResearchLogger.IS_LOGGING_EVERYTHING} is true, then publish if there are more than
+ * {@code minNGramSize} words in the logUnits, otherwise wait. if {@link
+ * ResearchLogger.IS_LOGGING_EVERYTHING} is false, then ensure that there are exactly nGramSize
+ * words in the LogUnits.
*
- * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
- * non-character data that is typed between words. The decision about privacy is made based on
- * the buffer's entire content. If it is decided that the privacy risks are too great to upload
- * the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
- * the screen orientation and other characteristics about the device can be uploaded without
- * revealing much about the user.
+ * @return one of the {@code PUBLISHABILITY_*} result codes defined in this class.
*/
- private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
+ private int getPublishabilityResultCode(final ArrayList<LogUnit> logUnits,
+ final int nGramSize) {
// Bypass privacy checks when debugging.
if (ResearchLogger.IS_LOGGING_EVERYTHING) {
if (mIsStopping) {
- return true;
+ return PUBLISHABILITY_UNPUBLISHABLE_STOPPING;
}
// Only check that it is the right length. If not, wait for later words to make
// complete n-grams.
@@ -129,13 +141,17 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
final LogUnit logUnit = logUnits.get(i);
numWordsInLogUnitList += logUnit.getNumWords();
}
- return numWordsInLogUnitList >= minNGramSize;
+ if (numWordsInLogUnitList >= nGramSize) {
+ return PUBLISHABILITY_PUBLISHABLE;
+ } else {
+ return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
+ }
}
// Check that we are not sampling too frequently. Having sampled recently might disclose
// too much of the user's intended meaning.
if (mNumWordsUntilSafeToSample > 0) {
- return false;
+ return PUBLISHABILITY_UNPUBLISHABLE_SAMPLED_TOO_RECENTLY;
}
// Reload the dictionary in case it has changed (e.g., because the user has changed
// languages).
@@ -144,7 +160,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
// contents to potentially pose a privacy risk.
- return false;
+ return PUBLISHABILITY_UNPUBLISHABLE_DICTIONARY_UNAVAILABLE;
}
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
@@ -155,7 +171,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
if (!logUnit.hasOneOrMoreWords()) {
// Digits outside words are a privacy threat.
if (logUnit.mayContainDigit()) {
- return false;
+ return PUBLISHABILITY_UNPUBLISHABLE_MAY_CONTAIN_DIGIT;
}
} else {
numWordsInLogUnitList += logUnit.getNumWords();
@@ -168,14 +184,18 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
+ ResearchLogger.hasLetters(word)
+ ", isValid: " + (dictionary.isValidWord(word)));
}
- return false;
+ return PUBLISHABILITY_UNPUBLISHABLE_NOT_IN_DICTIONARY;
}
}
}
}
// Finally, only return true if the ngram is the right size.
- return numWordsInLogUnitList == minNGramSize;
+ if (numWordsInLogUnitList == nGramSize) {
+ return PUBLISHABILITY_PUBLISHABLE;
+ } else {
+ return PUBLISHABILITY_UNPUBLISHABLE_INCORRECT_WORD_COUNT;
+ }
}
public void shiftAndPublishAll() throws IOException {
@@ -196,11 +216,29 @@ public abstract class MainLogBuffer extends FixedLogBuffer {
}
}
+ /**
+ * If there is a safe n-gram at the front of this log buffer, publish it with all details, and
+ * remove the LogUnits that constitute it.
+ *
+ * An n-gram might not be "safe" if it violates privacy controls. E.g., it might contain
+ * numbers, an out-of-vocabulary word, or another n-gram may have been published recently. If
+ * there is no safe n-gram, then the LogUnits up through the first word-containing LogUnit are
+ * published, but without disclosing any privacy-related details, such as the word the LogUnit
+ * generated, motion data, etc.
+ *
+ * Note that a LogUnit can hold more than one word if the user types without explicit spaces.
+ * In this case, the words may be grouped together in such a way that pulling an n-gram off the
+ * front would require splitting a LogUnit. Splitting a LogUnit is not possible, so this case
+ * is treated just as the unsafe n-gram case. This may cause n-grams to be sampled at slightly
+ * less than the target frequency.
+ */
protected final void publishLogUnitsAtFrontOfBuffer() throws IOException {
// TODO: Refactor this method to require fewer passes through the LogUnits. Should really
// require only one pass.
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
- if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
+ final int publishabilityResultCode = getPublishabilityResultCode(logUnits, N_GRAM_SIZE);
+ ResearchLogger.recordPublishabilityResultCode(publishabilityResultCode);
+ if (publishabilityResultCode == MainLogBuffer.PUBLISHABILITY_PUBLISHABLE) {
// Good n-gram at the front of the buffer. Publish it, disclosing details.
publish(logUnits, true /* canIncludePrivateData */);
shiftOutWords(N_GRAM_SIZE);