From e92b5e145f74808ff778a42dc5ba979aa27343ca Mon Sep 17 00:00:00 2001 From: Kurt Partridge Date: Mon, 15 Apr 2013 18:41:59 -0700 Subject: Allow LogUnits to hold >1 word LogUnits have been annotated with the autocorrected words, but until now this was assumed to be a single word without spaces. But spaceless typing can result in spaces in the LogUnit label. With this change, the LogUnit inspects the autocorrected text to determine how many words were inserted, and counts them accurately. This change corrects a privacy problem, which was that if the word sampling algorithm chose a LogUnit that actually contained multiple words, then more than two successive words would be included in the log. Change-Id: I7c01c3dd3ac33d7e96c00836256bae9c14b124ed --- .../inputmethod/research/MainLogBuffer.java | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'java/src/com/android/inputmethod/research/MainLogBuffer.java') diff --git a/java/src/com/android/inputmethod/research/MainLogBuffer.java b/java/src/com/android/inputmethod/research/MainLogBuffer.java index cd4c1db6e..42ef5d3b6 100644 --- a/java/src/com/android/inputmethod/research/MainLogBuffer.java +++ b/java/src/com/android/inputmethod/research/MainLogBuffer.java @@ -126,10 +126,7 @@ public abstract class MainLogBuffer extends FixedLogBuffer { final int length = logUnits.size(); for (int i = 0; i < length; i++) { final LogUnit logUnit = logUnits.get(i); - final String word = logUnit.getWord(); - if (word != null) { - numWordsInLogUnitList++; - } + numWordsInLogUnitList += logUnit.getNumWords(); } return numWordsInLogUnitList >= minNGramSize; } @@ -153,29 +150,31 @@ public abstract class MainLogBuffer extends FixedLogBuffer { // the complete buffer contents in detail. int numWordsInLogUnitList = 0; final int length = logUnits.size(); - for (int i = 0; i < length; i++) { - final LogUnit logUnit = logUnits.get(i); - if (!logUnit.hasWord()) { + for (final LogUnit logUnit : logUnits) { + if (!logUnit.hasOneOrMoreWords()) { // Digits outside words are a privacy threat. if (logUnit.mayContainDigit()) { return false; } } else { - numWordsInLogUnitList++; - final String word = logUnit.getWord(); - // Words not in the dictionary are a privacy threat. - if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { - if (DEBUG) { - Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word) - + ", isValid: " + (dictionary.isValidWord(word))); + numWordsInLogUnitList += logUnit.getNumWords(); + final String[] words = logUnit.getWordsAsStringArray(); + for (final String word : words) { + // Words not in the dictionary are a privacy threat. + if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) { + if (DEBUG) { + Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: " + + ResearchLogger.hasLetters(word) + + ", isValid: " + (dictionary.isValidWord(word))); + } + return false; } - return false; } } } - // Finally, only return true if the minNGramSize is met. - return numWordsInLogUnitList >= minNGramSize; + // Finally, only return true if the ngram is the right size. + return numWordsInLogUnitList == minNGramSize; } public void shiftAndPublishAll() { @@ -198,11 +197,14 @@ public abstract class MainLogBuffer extends FixedLogBuffer { shiftOutWords(N_GRAM_SIZE); mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams; } else { - // No good n-gram at front, and buffer is full. Shift out the first word (or if there - // is none, the existing logUnits). - logUnits = peekAtFirstNWords(1); + // No good n-gram at front, and buffer is full. Shift out up through the first logUnit + // with associated words (or if there is none, all the existing logUnits). + logUnits.clear(); + for (LogUnit logUnit = shiftOut(); logUnit != null && !logUnit.hasOneOrMoreWords(); + logUnit = shiftOut()) { + logUnits.add(logUnit); + } publish(logUnits, false /* canIncludePrivateData */); - shiftOutWords(1); } } -- cgit v1.2.3-83-g751a