java/src/com/android/inputmethod/research/MainLogBuffer.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225

/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.inputmethod.research;

import android.util.Log;

import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
import com.android.inputmethod.latin.define.ProductionFlag;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Random;

/**
 * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
 *
 * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
 * be logged in enough detail to determine their contents, 2) only a subset of words are logged
 * in detail, such as 10%, and 3) no numbers are logged.
 *
 * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
 * words, they are added here.  But if the user backs up over their current word to edit a word
 * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
 * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
 * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
 * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
 * a user can perform.
 *
 * To balance these requirements (keep history so user can edit, flush history so it does not pile
 * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
 * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
 * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
 * However, the additional non-detailed words are retained, in case the user backspaces to edit
 * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
 * as new words arrive.  After enough non-detailed words have been pushed out to account for the
 * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
 *
 * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
 * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
 * dictionary words.
 *
 * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
 * n-gram containing dictionary words.
 */
public abstract class MainLogBuffer extends FixedLogBuffer {
    private static final String TAG = MainLogBuffer.class.getSimpleName();
    private static final boolean DEBUG = false
            && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;

    // Number of words making up each sampled n-gram.  With N_GRAM_SIZE = 2, bigrams are sampled.
    public static final int N_GRAM_SIZE = 2;

    // Source of the main dictionary used for the in-vocabulary check.  Stays null until
    // setSuggest() is called.
    private Suggest mSuggest;
    // Set by setIsStopping().  Only consulted when ResearchLogger.IS_LOGGING_EVERYTHING is on.
    private boolean mIsStopping = false;

    // Value that resetWordCounter() restores into mNumWordsUntilSafeToSample.
    /* package for test */ int mNumWordsBetweenNGrams;

    // Countdown of words that must still be shifted out before an n-gram may be sampled in
    // detail.  Decremented in shiftOutWords() and restored to mNumWordsBetweenNGrams by
    // resetWordCounter() once a sample has been published.
    /* package for test */ int mNumWordsUntilSafeToSample;

    /**
     * Creates a buffer with capacity for one n-gram plus the words separating two samples.
     *
     * @param wordsBetweenSamples The number of words to let pass between two detailed n-grams.
     * @param numInitialWordsToIgnore The initial value of the sampling countdown.  Ignored (the
     * countdown starts at 0) when DEBUG is enabled.
     */
    public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore) {
        super(N_GRAM_SIZE + wordsBetweenSamples);
        mNumWordsBetweenNGrams = wordsBetweenSamples;
        if (DEBUG) {
            mNumWordsUntilSafeToSample = 0;
        } else {
            mNumWordsUntilSafeToSample = numInitialWordsToIgnore;
        }
    }

    /**
     * Sets the Suggest instance from which the main dictionary is obtained.
     *
     * @param suggest The Suggest to query; may be null, in which case no dictionary is available.
     */
    public void setSuggest(final Suggest suggest) {
        mSuggest = suggest;
    }

    /**
     * Looks up the main dictionary through the current Suggest.
     *
     * @return The main dictionary, or null if no Suggest has been set or it reports that it has
     * no main dictionary.
     */
    private Dictionary getDictionary() {
        if (mSuggest != null && mSuggest.hasMainDictionary()) {
            return mSuggest.getMainDictionary();
        }
        return null;
    }

    /**
     * Restarts the sampling countdown at mNumWordsBetweenNGrams.
     */
    public void resetWordCounter() {
        mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
    }

    /**
     * Flags this buffer as stopping.  When ResearchLogger.IS_LOGGING_EVERYTHING is set, this makes
     * isSafeNGram() accept whatever is at the front of the buffer regardless of its length.
     */
    public void setIsStopping() {
        mIsStopping = true;
    }

    /**
     * Counts the LogUnits in the list whose getWord() is non-null.
     */
    private static int countUnitsWithNonNullWord(final ArrayList<LogUnit> logUnits) {
        int count = 0;
        for (final LogUnit unit : logUnits) {
            if (unit.getWord() != null) {
                count++;
            }
        }
        return count;
    }

    /**
     * Decides whether the given LogUnits may be published in full detail without violating user
     * privacy.
     *
     * In normal operation, the answer is yes only if all of the following hold: the sampling
     * countdown has run out, the main dictionary is available, no word-less LogUnit may contain a
     * digit, every word that has letters is valid in the main dictionary, and the list holds at
     * least minNGramSize words.  When the answer is no, the caller may still publish a censored
     * version of the LogUnits.
     *
     * When ResearchLogger.IS_LOGGING_EVERYTHING is set, all of these checks are skipped: the
     * answer is yes if the buffer is stopping, and otherwise depends only on the word count.
     *
     * @param logUnits The LogUnits to examine.
     * @param minNGramSize The minimum number of words the LogUnits must contain.
     * @return Whether the LogUnits can be published including private data.
     */
    private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
        if (ResearchLogger.IS_LOGGING_EVERYTHING) {
            // Privacy checks are bypassed.  Unless stopping, only require enough words; if there
            // are too few, later words will complete the n-gram.
            return mIsStopping || countUnitsWithNonNullWord(logUnits) >= minNGramSize;
        }

        // Sampling again too soon after the previous sample might disclose too much of the
        // user's intended meaning.
        if (mNumWordsUntilSafeToSample > 0) {
            return false;
        }

        // Fetch the dictionary afresh on every call, since it may have changed (e.g., because
        // the user switched languages).  Without it, out-of-vocabulary words cannot be detected,
        // so the whole list has to be treated as a potential privacy risk.
        final Dictionary mainDict = getDictionary();
        if (mainDict == null) {
            return false;
        }

        // A single offending LogUnit is enough to reject the whole list.
        int wordCount = 0;
        for (final LogUnit unit : logUnits) {
            if (unit.hasWord()) {
                wordCount++;
                final String candidate = unit.getWord();
                // A word with letters that the dictionary does not know is a privacy threat.
                if (ResearchLogger.hasLetters(candidate) && !mainDict.isValidWord(candidate)) {
                    if (DEBUG) {
                        Log.d(TAG, "NOT SAFE!: hasLetters: " + ResearchLogger.hasLetters(candidate)
                                + ", isValid: " + (mainDict.isValidWord(candidate)));
                    }
                    return false;
                }
            } else if (unit.mayContainDigit()) {
                // Digits typed outside of words are a privacy threat.
                return false;
            }
        }

        // Everything passed; the only remaining requirement is the minimum word count.
        return wordCount >= minNGramSize;
    }

    /**
     * Publishes from the front of the buffer repeatedly until no LogUnits remain.
     */
    public void shiftAndPublishAll() {
        final LinkedList<LogUnit> remaining = getLogUnits();
        while (!remaining.isEmpty()) {
            publishLogUnitsAtFrontOfBuffer();
        }
    }

    @Override
    protected final void onBufferFull() {
        publishLogUnitsAtFrontOfBuffer();
    }

    /**
     * Publishes whatever is at the front of the buffer and shifts it out.
     *
     * If the first N_GRAM_SIZE words form a safe n-gram, they are published with private data,
     * shifted out, and the sampling countdown is restarted.  Otherwise only the result of
     * peekAtFirstNWords(1) is published, without private data, and one word is shifted out.
     */
    protected final void publishLogUnitsAtFrontOfBuffer() {
        final ArrayList<LogUnit> candidateNGram = peekAtFirstNWords(N_GRAM_SIZE);
        if (!isSafeNGram(candidateNGram, N_GRAM_SIZE)) {
            // No publishable n-gram at the front.  Push a single word through in censored form.
            final ArrayList<LogUnit> firstWord = peekAtFirstNWords(1);
            publish(firstWord, false /* canIncludePrivateData */);
            shiftOutWords(1);
            return;
        }
        // Good n-gram at the front of the buffer.  Publish it, disclosing details.
        publish(candidateNGram, true /* canIncludePrivateData */);
        shiftOutWords(N_GRAM_SIZE);
        resetWordCounter();
    }

    /**
     * Called when a list of logUnits should be published.
     *
     * Subclasses supply the actual publication mechanism.
     *
     * @param logUnits The list of logUnits to be published.
     * @param canIncludePrivateData Whether the private data in the logUnits can be included in
     * publication.
     */
    protected abstract void publish(final ArrayList<LogUnit> logUnits,
            final boolean canIncludePrivateData);

    /**
     * Shifts words out through the superclass and lowers the sampling countdown by the number of
     * actual words that left the buffer, measured with getNumActualWords() before and after.
     *
     * @param numWords The number of words requested to be shifted out.
     */
    @Override
    protected void shiftOutWords(final int numWords) {
        final int wordCountBefore = getNumActualWords();
        super.shiftOutWords(numWords);
        mNumWordsUntilSafeToSample -= wordCountBefore - getNumActualWords();
        if (DEBUG) {
            Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
        }
    }
}