1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
|
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package com.android.inputmethod.research;
import android.util.Log;
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
import com.android.inputmethod.latin.define.ProductionFlag;
import java.util.Random;
public class MainLogBuffer extends LogBuffer {
private static final String TAG = MainLogBuffer.class.getSimpleName();
private static final boolean DEBUG = false && ProductionFlag.IS_EXPERIMENTAL_DEBUG;
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
private static final int N_GRAM_SIZE = 2;
// The number of words between n-grams to omit from the log.
private static final int DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES =
ProductionFlag.IS_EXPERIMENTAL_DEBUG ? 2 : 18;
private final ResearchLog mResearchLog;
private Suggest mSuggest;
// The minimum periodicity with which n-grams can be sampled. E.g. mWinWordPeriod is 10 if
// every 10th bigram is sampled, i.e., words 1-8 are not, but the bigram at words 9 and 10, etc.
// for 11-18, and the bigram at words 19 and 20. If an n-gram is not safe (e.g. it contains a
// number in the middle or an out-of-vocabulary word), then sampling is delayed until a safe
// n-gram does appear.
/* package for test */ int mMinWordPeriod;
// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
// after a sample is taken.
/* package for test */ int mWordsUntilSafeToSample;
public MainLogBuffer(final ResearchLog researchLog) {
super(N_GRAM_SIZE);
mResearchLog = researchLog;
mMinWordPeriod = DEFAULT_NUMBER_OF_WORDS_BETWEEN_SAMPLES + N_GRAM_SIZE;
final Random random = new Random();
mWordsUntilSafeToSample = random.nextInt(mMinWordPeriod);
}
public void setSuggest(Suggest suggest) {
mSuggest = suggest;
}
@Override
public void shiftIn(final LogUnit newLogUnit) {
super.shiftIn(newLogUnit);
if (newLogUnit.hasWord()) {
if (mWordsUntilSafeToSample > 0) {
mWordsUntilSafeToSample--;
}
}
if (DEBUG) {
Log.d(TAG, "shiftedIn " + (newLogUnit.hasWord() ? newLogUnit.getWord() : ""));
}
}
public void resetWordCounter() {
mWordsUntilSafeToSample = mMinWordPeriod;
}
/**
* Determines whether the content of the MainLogBuffer can be safely uploaded in its complete
* form and still protect the user's privacy.
*
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
* non-character data that is typed between words. The decision about privacy is made based on
* the buffer's entire content. If it is decided that the privacy risks are too great to upload
* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
* the screen orientation and other characteristics about the device can be uploaded without
* revealing much about the user.
*/
public boolean isSafeToLog() {
// Check that we are not sampling too frequently. Having sampled recently might disclose
// too much of the user's intended meaning.
if (mWordsUntilSafeToSample > 0) {
return false;
}
if (mSuggest == null || !mSuggest.hasMainDictionary()) {
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a word
// is out-of-vocabulary or not. Therefore, we must judge the entire buffer contents to
// potentially pose a privacy risk.
return false;
}
// Reload the dictionary in case it has changed (e.g., because the user has changed
// languages).
final Dictionary dictionary = mSuggest.getMainDictionary();
if (dictionary == null) {
return false;
}
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload the
// complete buffer contents in detail.
final int length = mLogUnits.size();
for (int i = 0; i < length; i++) {
final LogUnit logUnit = mLogUnits.get(i);
final String word = logUnit.getWord();
if (word == null) {
// Digits outside words are a privacy threat.
if (logUnit.mayContainDigit()) {
return false;
}
} else {
// Words not in the dictionary are a privacy threat.
if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
if (DEBUG) {
Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(word)
+ ", isValid: " + (dictionary.isValidWord(word)));
}
return false;
}
}
}
// All checks have passed; this buffer's content can be safely uploaded.
return true;
}
@Override
protected void onShiftOut(LogUnit logUnit) {
if (mResearchLog != null) {
mResearchLog.publish(logUnit, false /* isIncludingPrivateData */);
}
}
}
|