Use translation of fallback umlauts digraphs for German.

For German : handle "ae", "oe" and "ue" to be alternate forms for umlaut-bearing versions of "a", "o" and "u". Issue: 3275926 Change-Id: I056c707cdacc464ceab63be56c016c7f8439196c
author: Jean Chalard <jchalard@google.com> 2011-02-25 17:56:53 +0900
committer: Jean Chalard <jchalard@google.com> 2011-03-03 11:52:23 +0900
commit: c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4 (patch)
tree: 11a87dc05035a76aa4e9f95095cbbf8b67ea3e20 /native/src/unigram_dictionary.cpp
parent: e59491460b0411bed430a5ca6eca0c56c5bf18d9 (diff)
download: latinime-c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4.tar.gz
latinime-c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4.tar.xz
latinime-c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4.zip
1 files changed, 125 insertions, 29 deletions
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 72b0f361a..9aa36b064 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -29,20 +29,136 @@
 
 namespace latinime {
 
+const UnigramDictionary::digraph_t UnigramDictionary::GERMAN_UMLAUT_DIGRAPHS[] =
+        { { 'a', 'e' },
+        { 'o', 'e' },
+        { 'u', 'e' } };
+
 UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterMultiplier,
         int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
         const bool isLatestDictVersion)
     : DICT(dict), MAX_WORD_LENGTH(maxWordLength), MAX_WORDS(maxWords),
     MAX_PROXIMITY_CHARS(maxProximityChars), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     TYPED_LETTER_MULTIPLIER(typedLetterMultiplier), FULL_WORD_MULTIPLIER(fullWordMultiplier),
-    ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0) {
+    ROOT_POS(isLatestDictVersion ? DICTIONARY_HEADER_SIZE : 0),
+    BYTES_IN_ONE_CHAR(MAX_PROXIMITY_CHARS * sizeof(*mInputCodes)) {
     if (DEBUG_DICT) LOGI("UnigramDictionary - constructor");
 }
 
 UnigramDictionary::~UnigramDictionary() {}
 
-int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int *xcoordinates,
-        int *ycoordinates, int *codes, int codesSize, unsigned short *outWords, int *frequencies) {
+static inline unsigned int getCodesBufferSize(const int* codes, const int codesSize,
+        const int MAX_PROXIMITY_CHARS) {
+    return sizeof(*codes) * MAX_PROXIMITY_CHARS * codesSize;
+}
+
+bool UnigramDictionary::isDigraph(const int* codes, const int i, const int codesSize) const {
+
+    // There can't be a digraph if we don't have at least 2 characters to examine
+    if (i + 2 > codesSize) return false;
+
+    // Search for the first char of some digraph
+    int lastDigraphIndex = -1;
+    const int thisChar = codes[i * MAX_PROXIMITY_CHARS];
+    for (lastDigraphIndex = sizeof(GERMAN_UMLAUT_DIGRAPHS) / sizeof(GERMAN_UMLAUT_DIGRAPHS[0]) - 1;
+            lastDigraphIndex >= 0; --lastDigraphIndex) {
+        if (thisChar == GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].first) break;
+    }
+    // No match: return early
+    if (lastDigraphIndex < 0) return false;
+
+    // It's an interesting digraph if the second char matches too.
+    return GERMAN_UMLAUT_DIGRAPHS[lastDigraphIndex].second == codes[(i + 1) * MAX_PROXIMITY_CHARS];
+}
+
+// Mostly the same arguments as the non-recursive version, except:
+// codes is the original value. It points to the start of the work buffer, and gets passed as is.
+// codesSize is the size of the user input (thus, it is the size of codesSrc).
+// codesDest is the current point in the work buffer.
+// codesSrc is the current point in the user-input, original, content-unmodified buffer.
+// codesRemain is the remaining size in codesSrc.
+void UnigramDictionary::getWordWithDigraphSuggestionsRec(const ProximityInfo *proximityInfo,
+        const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
+        const int codesBufferSize, const int flags, const int* codesSrc, const int codesRemain,
+        int* codesDest, unsigned short* outWords, int* frequencies) {
+
+    for (int i = 0; i < codesRemain; ++i) {
+        if (isDigraph(codesSrc, i, codesRemain)) {
+            // Found a digraph. We will try both spellings. eg. the word is "pruefen"
+
+            // Copy the word up to the first char of the digraph, then continue processing
+            // on the remaining part of the word, skipping the second char of the digraph.
+            // In our example, copy "pru" and continue running on "fen"
+            memcpy(codesDest, codesSrc, i * BYTES_IN_ONE_CHAR);
+            getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+                    codesBufferSize, flags, codesSrc + (i + 1) * MAX_PROXIMITY_CHARS,
+                    codesRemain - i - 1, codesDest + i * MAX_PROXIMITY_CHARS,
+                    outWords, frequencies);
+
+            // Copy the second char of the digraph in place, then continue processing on
+            // the remaining part of the word.
+            // In our example, after "pru" in the buffer copy the "e", and continue running on "fen"
+            memcpy(codesDest + i * MAX_PROXIMITY_CHARS, codesSrc + i * MAX_PROXIMITY_CHARS,
+                    BYTES_IN_ONE_CHAR);
+            getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+                    codesBufferSize, flags, codesSrc + i * MAX_PROXIMITY_CHARS, codesRemain - i,
+                    codesDest + i * MAX_PROXIMITY_CHARS, outWords, frequencies);
+            return;
+        }
+    }
+
+    // If we come here, we hit the end of the word: let's check it against the dictionary.
+    // In our example, we'll come here once for "prufen" and then once for "pruefen".
+    // If the word contains several digraphs, we'll come it for the product of them.
+    // eg. if the word is "ueberpruefen" we'll test, in order, against
+    // "uberprufen", "uberpruefen", "ueberprufen", "ueberpruefen".
+    const unsigned int remainingBytes = BYTES_IN_ONE_CHAR * codesRemain;
+    if (0 != remainingBytes)
+        memcpy(codesDest, codesSrc, remainingBytes);
+
+    getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+            (codesDest - codesBuffer) / MAX_PROXIMITY_CHARS + codesRemain, outWords, frequencies);
+}
+
+int UnigramDictionary::getSuggestions(const ProximityInfo *proximityInfo, const int *xcoordinates,
+        const int *ycoordinates, const int *codes, const int codesSize, const int flags,
+        unsigned short *outWords, int *frequencies) {
+
+    if (REQUIRES_GERMAN_UMLAUT_PROCESSING & flags)
+    { // Incrementally tune the word and try all possibilities
+        int codesBuffer[getCodesBufferSize(codes, codesSize, MAX_PROXIMITY_CHARS)];
+        getWordWithDigraphSuggestionsRec(proximityInfo, xcoordinates, ycoordinates, codesBuffer,
+                codesSize, flags, codes, codesSize, codesBuffer, outWords, frequencies);
+    } else { // Normal processing
+        getWordSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, codesSize,
+                outWords, frequencies);
+    }
+
+    PROF_START(6);
+    // Get the word count
+    int suggestedWordsCount = 0;
+    while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
+        suggestedWordsCount++;
+    }
+
+    if (DEBUG_DICT) {
+        LOGI("Returning %d words", suggestedWordsCount);
+        LOGI("Next letters: ");
+        for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
+            if (mNextLettersFrequency[k] > 0) {
+                LOGI("%c = %d,", k, mNextLettersFrequency[k]);
+            }
+        }
+    }
+    PROF_END(6);
+    PROF_CLOSE;
+    return suggestedWordsCount;
+}
+
+void UnigramDictionary::getWordSuggestions(const ProximityInfo *proximityInfo,
+        const int *xcoordinates, const int *ycoordinates, const int *codes, const int codesSize,
+        unsigned short *outWords, int *frequencies) {
+
     PROF_OPEN;
     PROF_START(0);
     initSuggestions(codes, codesSize, outWords, frequencies);
@@ -103,30 +219,10 @@ int UnigramDictionary::getSuggestions(ProximityInfo *proximityInfo, int *xcoordi
         }
     }
     PROF_END(5);
-
-    PROF_START(6);
-    // Get the word count
-    int suggestedWordsCount = 0;
-    while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
-        suggestedWordsCount++;
-    }
-
-    if (DEBUG_DICT) {
-        LOGI("Returning %d words", suggestedWordsCount);
-        LOGI("Next letters: ");
-        for (int k = 0; k < NEXT_LETTERS_SIZE; k++) {
-            if (mNextLettersFrequency[k] > 0) {
-                LOGI("%c = %d,", k, mNextLettersFrequency[k]);
-            }
-        }
-    }
-    PROF_END(6);
-    PROF_CLOSE;
-    return suggestedWordsCount;
 }
 
-void UnigramDictionary::initSuggestions(int *codes, int codesSize, unsigned short *outWords,
-        int *frequencies) {
+void UnigramDictionary::initSuggestions(const int *codes, const int codesSize,
+        unsigned short *outWords, int *frequencies) {
     if (DEBUG_DICT) LOGI("initSuggest");
     mFrequencies = frequencies;
     mOutputChars = outWords;
@@ -204,7 +300,7 @@ bool UnigramDictionary::sameAsTyped(unsigned short *word, int length) {
     if (length != mInputLength) {
         return false;
     }
-    int *inputCodes = mInputCodes;
+    const int *inputCodes = mInputCodes;
     while (length--) {
         if ((unsigned int) *inputCodes != (unsigned int) *word) {
             return false;
@@ -423,7 +519,7 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
     const int currentChar = *getInputCharsAt(inputIndex);
     const int leftIndex = inputIndex - 1;
     if (leftIndex >= 0) {
-        int *leftChars = getInputCharsAt(leftIndex);
+        const int *leftChars = getInputCharsAt(leftIndex);
         int i = 0;
         while (leftChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
             if (leftChars[i++] == currentChar) return true;
@@ -431,7 +527,7 @@ inline bool UnigramDictionary::existsAdjacentProximityChars(const int inputIndex
     }
     const int rightIndex = inputIndex + 1;
     if (rightIndex < inputLength) {
-        int *rightChars = getInputCharsAt(rightIndex);
+        const int *rightChars = getInputCharsAt(rightIndex);
         int i = 0;
         while (rightChars[i] > 0 && i < MAX_PROXIMITY_CHARS) {
             if (rightChars[i++] == currentChar) return true;
@@ -523,7 +619,7 @@ inline bool UnigramDictionary::processCurrentNode(const int pos, const int depth
         *newDiffs = diffs;
         *newInputIndex = inputIndex;
     } else {
-        int *currentChars = getInputCharsAt(inputIndex);
+        const int *currentChars = getInputCharsAt(inputIndex);
 
         if (transposedPos >= 0) {
             if (inputIndex == transposedPos) currentChars += MAX_PROXIMITY_CHARS;
author	Jean Chalard <jchalard@google.com>	2011-02-25 17:56:53 +0900
committer	Jean Chalard <jchalard@google.com>	2011-03-03 11:52:23 +0900
commit	c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4 (patch)
tree	11a87dc05035a76aa4e9f95095cbbf8b67ea3e20 /native/src/unigram_dictionary.cpp
parent	e59491460b0411bed430a5ca6eca0c56c5bf18d9 (diff)
download	latinime-c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4.tar.gz latinime-c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4.tar.xz latinime-c2bbc6a4499a6da979381fa0e8e6e855a5ac6aa4.zip