Merge "Block offensive words in multi-word suggestions"

author: Adrian Velicu <adrianv@google.com> 2014-10-31 07:31:25 +0000
committer: Android (Google) Code Review <android-gerrit@google.com> 2014-10-31 07:31:25 +0000
commit: 434c08a79ec60347590e26725372834678a8aa84 (patch)
tree: 584af8737119792abfe4d8e2cff34dea9d7d8d17
parent: ed575b62da197e81ace23eeb1af09fd776fa0b41 (diff)
parent: 10416241f7badaedfbafd9858deda9dca496bd08 (diff)
download: latinime-434c08a79ec60347590e26725372834678a8aa84.tar.gz
latinime-434c08a79ec60347590e26725372834678a8aa84.tar.xz
latinime-434c08a79ec60347590e26725372834678a8aa84.zip
3 files changed, 65 insertions, 13 deletions
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
index 3283f6deb..23103b9f7 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.cpp
@@ -76,6 +76,52 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
             weightOfLangModelVsSpatialModelToOutputSuggestions, outSuggestionResults);
 }
 
+/* static */ bool SuggestionsOutputUtils::shouldBlockWord(
+        const SuggestOptions *const suggestOptions, const DicNode *const terminalDicNode,
+        const WordAttributes wordAttributes, const bool isLastWord) {
+    const bool currentWordExactMatch =
+            ErrorTypeUtils::isExactMatch(terminalDicNode->getContainedErrorTypes());
+    // User setting: when on, a possibly offensive word is blocked unless it is an exact match
+    // for the last word of the input (see the cases below).
+    const bool shouldBlockOffensiveWords = suggestOptions->blockOffensiveWords();
+
+    const bool isBlockedOffensiveWord = shouldBlockOffensiveWords &&
+            wordAttributes.isPossiblyOffensive();
+
+    // This function is called in two situations:
+    //
+    // 1) At the end of a search, in which case terminalDicNode will point to the last DicNode
+    //    of the search, and isLastWord will be true.
+    //                    "fuck"
+    //                        |
+    //                        \ terminalDicNode (isLastWord=true, currentWordExactMatch=true)
+    //    In this case, if the current word is an exact match, we will always let the word
+    //    through, even if the user is blocking offensive words (it's exactly what they typed!)
+    //
+    // 2) In the middle of the search, when we hit a terminal node, to decide whether or not
+    //    to start a new search at root, to try to match the rest of the input. In this case,
+    //    terminalDicNode will point to the terminal node we just hit, and isLastWord will be
+    //    false.
+    //                    "fuckvthis"
+    //                        |
+    //                        \ terminalDicNode (isLastWord=false, currentWordExactMatch=true)
+    //
+    //    In this case, we should NOT allow the match through (correcting "fuckthis" to
+    //    "fuck this" when offensive words are blocked would be a bad idea).
+    //
+    // Note: in a multi-word correction where the offensive word is typed last (e.g. for the
+    // input "allfuck"), this function is called with isLastWord==true but
+    // currentWordExactMatch==false, so the word is still blocked when the setting is on.
+    //                    "allfuck"
+    //                           |
+    //                           \ terminalDicNode (isLastWord=true, currentWordExactMatch=false)
+    if (isLastWord && currentWordExactMatch) {
+        return false;
+    } else {
+        return isBlockedOffensiveWord;
+    }
+}
+
 /* static */ void SuggestionsOutputUtils::outputSuggestionsOfDicNode(
         const Scoring *const scoringPolicy, DicTraverseSession *traverseSession,
         const DicNode *const terminalDicNode, const float weightOfLangModelVsSpatialModel,
@@ -98,24 +144,16 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
     const bool isExactMatchWithIntentionalOmission =
             ErrorTypeUtils::isExactMatchWithIntentionalOmission(
                     terminalDicNode->getContainedErrorTypes());
-    const bool isFirstCharUppercase = terminalDicNode->isFirstCharUppercase();
-    // Heuristic: We exclude probability=0 first-char-uppercase words from exact match.
-    // (e.g. "AMD" and "and")
-    const bool isSafeExactMatch = isExactMatch
-            && !(wordAttributes.isPossiblyOffensive() && isFirstCharUppercase);
     const int outputTypeFlags =
             (wordAttributes.isPossiblyOffensive() ? Dictionary::KIND_FLAG_POSSIBLY_OFFENSIVE : 0)
-            | ((isSafeExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
+            | ((isExactMatch && boostExactMatches) ? Dictionary::KIND_FLAG_EXACT_MATCH : 0)
             | (isExactMatchWithIntentionalOmission ?
                     Dictionary::KIND_FLAG_EXACT_MATCH_WITH_INTENTIONAL_OMISSION : 0);
-
     // Entries that are blacklisted or do not represent a word should not be output.
     const bool isValidWord = !(wordAttributes.isBlacklisted() || wordAttributes.isNotAWord());
-    // When we have to block offensive words, non-exact matched offensive words should not be
-    // output.
-    const bool blockOffensiveWords = traverseSession->getSuggestOptions()->blockOffensiveWords();
-    const bool isBlockedOffensiveWord = blockOffensiveWords && wordAttributes.isPossiblyOffensive()
-            && !isSafeExactMatch;
+
+    const bool shouldBlockThisWord = shouldBlockWord(traverseSession->getSuggestOptions(),
+            terminalDicNode, wordAttributes, true /* isLastWord */);
 
     // Increase output score of top typing suggestion to ensure autocorrection.
     // TODO: Better integration with java side autocorrection logic.
@@ -127,7 +165,7 @@ const int SuggestionsOutputUtils::MIN_LEN_FOR_MULTI_WORD_AUTOCORRECT = 16;
 
     // Don't output invalid or blocked offensive words. However, we still need to submit their
     // shortcuts if any.
-    if (isValidWord && !isBlockedOffensiveWord) {
+    if (isValidWord && !shouldBlockThisWord) {
         int codePoints[MAX_WORD_LENGTH];
         terminalDicNode->outputResult(codePoints);
         const int indexToPartialCommit = outputSecondWordFirstLetterInputIndex ?
diff --git a/native/jni/src/suggest/core/result/suggestions_output_utils.h b/native/jni/src/suggest/core/result/suggestions_output_utils.h
index bf8497828..eca1f78b2 100644
--- a/native/jni/src/suggest/core/result/suggestions_output_utils.h
+++ b/native/jni/src/suggest/core/result/suggestions_output_utils.h
@@ -18,6 +18,7 @@
 #define LATINIME_SUGGESTIONS_OUTPUT_UTILS
 
 #include "defines.h"
+#include "suggest/core/dictionary/word_attributes.h"
 
 namespace latinime {
 
@@ -25,11 +26,19 @@ class BinaryDictionaryShortcutIterator;
 class DicNode;
 class DicTraverseSession;
 class Scoring;
+class SuggestOptions;
 class SuggestionResults;
 
 class SuggestionsOutputUtils {
  public:
     /**
+     * Returns true if the given word should be blocked, taking into account the user's
+     * preference on whether possibly offensive words may be suggested.
+     */
+    static bool shouldBlockWord(const SuggestOptions *const suggestOptions,
+            const DicNode *const terminalDicNode, const WordAttributes wordAttributes,
+            const bool isLastWord);
+    /**
      * Outputs the final list of suggestions (i.e., terminal nodes).
      */
     static void outputSuggestions(const Scoring *const scoringPolicy,
diff --git a/native/jni/src/suggest/core/suggest.cpp b/native/jni/src/suggest/core/suggest.cpp
index 68a36454e..c372d668b 100644
--- a/native/jni/src/suggest/core/suggest.cpp
+++ b/native/jni/src/suggest/core/suggest.cpp
@@ -416,6 +416,11 @@ void Suggest::createNextWordDicNode(DicTraverseSession *traverseSession, DicNode
             traverseSession->getDictionaryStructurePolicy()->getWordAttributesInContext(
                     dicNode->getPrevWordIds(), dicNode->getWordId(),
                     traverseSession->getMultiBigramMap());
+    if (SuggestionsOutputUtils::shouldBlockWord(traverseSession->getSuggestOptions(),
+            dicNode, wordAttributes, false /* isLastWord */)) {
+        return;
+    }
+
     if (!TRAVERSAL->isGoodToTraverseNextWord(dicNode, wordAttributes.getProbability())) {
         return;
     }
author	Adrian Velicu <adrianv@google.com>	2014-10-31 07:31:25 +0000
committer	Android (Google) Code Review <android-gerrit@google.com>	2014-10-31 07:31:25 +0000
commit	434c08a79ec60347590e26725372834678a8aa84 (patch)
tree	584af8737119792abfe4d8e2cff34dea9d7d8d17
parent	ed575b62da197e81ace23eeb1af09fd776fa0b41 (diff)
parent	10416241f7badaedfbafd9858deda9dca496bd08 (diff)
download	latinime-434c08a79ec60347590e26725372834678a8aa84.tar.gz latinime-434c08a79ec60347590e26725372834678a8aa84.tar.xz latinime-434c08a79ec60347590e26725372834678a8aa84.zip