aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- native/src/correction.cpp 96
-rw-r--r-- native/src/correction.h 2
-rw-r--r-- native/src/defines.h 6
-rw-r--r-- native/src/unigram_dictionary.cpp 92
-rw-r--r-- native/src/unigram_dictionary.h 4
-rw-r--r-- native/src/words_priority_queue.h 5
6 files changed, 198 insertions, 7 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 5dc6f8737..6a129d4e3 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -83,7 +83,7 @@ inline static void calcEditDistanceOneStep(int *editDistanceTable, const unsigne
inline static int getCurrentEditDistance(
int *editDistanceTable, const int inputLength, const int outputLength) {
- if (DEBUG_DICT) {
+ if (DEBUG_EDIT_DISTANCE) {
AKLOGI("getCurrentEditDistance %d, %d", inputLength, outputLength);
}
return editDistanceTable[(inputLength + 1) * (outputLength + 1) - 1];
@@ -935,6 +935,100 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
return totalFreq;
}
+/* static */
+int Correction::RankingAlgorithm::calcFreqForSplitTwoWordsOld(
+ const int firstFreq, const int secondFreq, const Correction* correction,
+ const unsigned short *word) {
+ const int spaceProximityPos = correction->mSpaceProximityPos;
+ const int missingSpacePos = correction->mMissingSpacePos;
+ if (DEBUG_DICT) {
+ int inputCount = 0;
+ if (spaceProximityPos >= 0) ++inputCount;
+ if (missingSpacePos >= 0) ++inputCount;
+ assert(inputCount <= 1);
+ }
+ const bool isSpaceProximity = spaceProximityPos >= 0;
+ const int inputLength = correction->mInputLength;
+ const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
+ const int secondWordLength = isSpaceProximity ? (inputLength - spaceProximityPos - 1)
+ : (inputLength - missingSpacePos);
+ const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
+
+ bool firstCapitalizedWordDemotion = false;
+ if (firstWordLength >= 2) {
+ firstCapitalizedWordDemotion = isUpperCase(word[0]);
+ }
+
+ bool secondCapitalizedWordDemotion = false;
+ if (secondWordLength >= 2) {
+ secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+ }
+
+ const bool capitalizedWordDemotion =
+ firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
+
+ if (DEBUG_DICT_FULL) {
+ AKLOGI("Two words: %c, %c, %d",
+ word[0], word[firstWordLength + 1], capitalizedWordDemotion);
+ }
+
+ if (firstWordLength == 0 || secondWordLength == 0) {
+ return 0;
+ }
+ const int firstDemotionRate = 100 - 100 / (firstWordLength + 1);
+ int tempFirstFreq = firstFreq;
+ multiplyRate(firstDemotionRate, &tempFirstFreq);
+
+ const int secondDemotionRate = 100 - 100 / (secondWordLength + 1);
+ int tempSecondFreq = secondFreq;
+ multiplyRate(secondDemotionRate, &tempSecondFreq);
+
+ const int totalLength = firstWordLength + secondWordLength;
+
+ // Combine the two demoted frequencies by plain addition. No extra promotion factor is applied
+ // at this point; the adjustments below are the only further scaling.
+ int totalFreq = tempFirstFreq + tempSecondFreq;
+
+ // This is a workaround to try offsetting the not-enough-demotion which will be done in
+ // calcNormalizedScore in Utils.java.
+ // In calcNormalizedScore the score will be demoted by (1 - 1 / length)
+ // but we demoted only (1 - 1 / (length + 1)) so we will additionally adjust freq by
+ // (1 - 1 / length) / (1 - 1 / (length + 1)) = (1 - 1 / (length * length))
+ const int normalizedScoreNotEnoughDemotionAdjustment = 100 - 100 / (totalLength * totalLength);
+ multiplyRate(normalizedScoreNotEnoughDemotionAdjustment, &totalFreq);
+
+ // At this moment, totalFreq is calculated by the following formula:
+ // (firstFreq * (1 - 1 / (firstWordLength + 1)) + secondFreq * (1 - 1 / (secondWordLength + 1)))
+ // * (1 - 1 / totalLength) / (1 - 1 / (totalLength + 1))
+
+ multiplyIntCapped(powerIntCapped(typedLetterMultiplier, totalLength), &totalFreq);
+
+ // This is another workaround to offset the demotion which will be done in
+ // calcNormalizedScore in Utils.java.
+ // In calcNormalizedScore the score will be demoted by (1 - 1 / length), so we promote by
+ // (1 + 1 / length) here because the synthetic freq of this "missing or mistyped space"
+ // suggestion candidate has already been adjusted above in this method.
+ const int normalizedScoreDemotionRateOffset = (100 + 100 / totalLength);
+ multiplyRate(normalizedScoreDemotionRateOffset, &totalFreq);
+
+ if (isSpaceProximity) {
+ // A word pair with one space proximity correction
+ if (DEBUG_DICT) {
+ AKLOGI("Found a word pair with space proximity correction.");
+ }
+ multiplyIntCapped(typedLetterMultiplier, &totalFreq);
+ multiplyRate(WORDS_WITH_PROXIMITY_CHARACTER_DEMOTION_RATE, &totalFreq);
+ }
+
+ multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &totalFreq);
+
+ if (capitalizedWordDemotion) {
+ multiplyRate(TWO_WORDS_CAPITALIZED_DEMOTION_RATE, &totalFreq);
+ }
+
+ return totalFreq;
+}
+
/* Damerau-Levenshtein distance */
inline static int editDistanceInternal(
int* editDistanceTable, const unsigned short* before,
diff --git a/native/src/correction.h b/native/src/correction.h
index a0fd55fd9..22a424f5c 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -100,6 +100,8 @@ class Correction {
const int freq, int *editDistanceTable, const Correction* correction);
static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq,
const Correction* correction, const unsigned short *word);
+ static int calcFreqForSplitTwoWordsOld(const int firstFreq, const int secondFreq,
+ const Correction* correction, const unsigned short *word);
static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
const unsigned short* after, const int afterLength, const int score);
static int editDistance(const unsigned short* before,
diff --git a/native/src/defines.h b/native/src/defines.h
index 31175c369..d739043a4 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -117,8 +117,8 @@ static void prof_out(void) {
#define DEBUG_TRACE DEBUG_DICT_FULL
#define DEBUG_PROXIMITY_INFO false
#define DEBUG_CORRECTION false
-#define DEBUG_CORRECTION_FREQ true
-#define DEBUG_WORDS_PRIORITY_QUEUE true
+#define DEBUG_CORRECTION_FREQ false
+#define DEBUG_WORDS_PRIORITY_QUEUE false
#else // FLAG_DBG
@@ -213,6 +213,8 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10
+#define TWO_WORDS_CORRECTION_THRESHOLD 0.22f
+
#define MAX_DEPTH_MULTIPLIER 3
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 69e3200fc..8be95bc40 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -241,8 +241,24 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
}
}
PROF_END(6);
- if (DEBUG_WORDS_PRIORITY_QUEUE) {
+ if (DEBUG_DICT) {
queuePool->dumpSubQueue1TopSuggestions();
+ for (int i = 0; i < SUB_QUEUE_MAX_COUNT; ++i) {
+ WordsPriorityQueue* queue = queuePool->getSubQueue1(i);
+ if (queue->size() > 0) {
+ WordsPriorityQueue::SuggestedWord* sw = queue->top();
+ const int score = sw->mScore;
+ const unsigned short* word = sw->mWord;
+ const int wordLength = sw->mWordLength;
+ double ns = Correction::RankingAlgorithm::calcNormalizedScore(
+ proximityInfo->getPrimaryInputWord(), i, word, wordLength, score);
+ ns += 0;
+ AKLOGI("--- TOP SUB WORDS for %d --- %d %f [%d]", i, score, ns,
+ (ns > TWO_WORDS_CORRECTION_THRESHOLD));
+ DUMP_WORD(proximityInfo->getPrimaryInputWord(), i);
+ DUMP_WORD(word, wordLength);
+ }
+ }
}
}
@@ -441,6 +457,80 @@ void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo
return;
}
+void UnigramDictionary::getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
+ const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool) {
+ WordsPriorityQueue *masterQueue = queuePool->getMasterQueue();
+
+ if (DEBUG_DICT) {
+ int inputCount = 0;
+ if (spaceProximityPos >= 0) ++inputCount;
+ if (missingSpacePos >= 0) ++inputCount;
+ assert(inputCount <= 1);
+ }
+ const bool isSpaceProximity = spaceProximityPos >= 0;
+ const int firstWordStartPos = 0;
+ const int secondWordStartPos = isSpaceProximity ? (spaceProximityPos + 1) : missingSpacePos;
+ const int firstWordLength = isSpaceProximity ? spaceProximityPos : missingSpacePos;
+ const int secondWordLength = isSpaceProximity
+ ? (inputLength - spaceProximityPos - 1)
+ : (inputLength - missingSpacePos);
+
+ if (inputLength >= MAX_WORD_LENGTH) return;
+ if (0 >= firstWordLength || 0 >= secondWordLength || firstWordStartPos >= secondWordStartPos
+ || firstWordStartPos < 0 || secondWordStartPos + secondWordLength > inputLength)
+ return;
+
+ const int newWordLength = firstWordLength + secondWordLength + 1;
+
+
+ // Space proximity preparation (currently disabled; kept commented out for reference)
+ //WordsPriorityQueue *subQueue = queuePool->getSubQueue1();
+ //initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, firstWordLength, subQueue,
+ //correction);
+ //getSuggestionCandidates(useFullEditDistance, firstWordLength, correction, subQueue, false,
+ //MAX_ERRORS_FOR_TWO_WORDS);
+
+ // Variable length array on the stack (a compiler extension in C++). Given the checks above,
+ // newWordLength is at most inputLength + 1, i.e. at most MAX_WORD_LENGTH elements.
+ unsigned short word[newWordLength];
+ const int firstFreq = getMostFrequentWordLike(
+ firstWordStartPos, firstWordLength, proximityInfo, mWord);
+ if (DEBUG_DICT) {
+ AKLOGI("First freq: %d", firstFreq);
+ }
+ if (firstFreq <= 0) return;
+
+ for (int i = 0; i < firstWordLength; ++i) {
+ word[i] = mWord[i];
+ }
+
+ const int secondFreq = getMostFrequentWordLike(
+ secondWordStartPos, secondWordLength, proximityInfo, mWord);
+ if (DEBUG_DICT) {
+ AKLOGI("Second freq: %d", secondFreq);
+ }
+ if (secondFreq <= 0) return;
+
+ word[firstWordLength] = SPACE;
+ for (int i = (firstWordLength + 1); i < newWordLength; ++i) {
+ word[i] = mWord[i - firstWordLength - 1];
+ }
+
+ // TODO: Remove initSuggestions and correction->setCorrectionParams
+ initSuggestions(proximityInfo, xcoordinates, ycoordinates, codes, inputLength, correction);
+
+ correction->setCorrectionParams(-1 /* skipPos */, -1 /* excessivePos */,
+ -1 /* transposedPos */, spaceProximityPos, missingSpacePos,
+ useFullEditDistance, false /* doAutoCompletion */, MAX_ERRORS_FOR_TWO_WORDS);
+ const int pairFreq = correction->getFreqForSplitTwoWords(firstFreq, secondFreq, word);
+ if (DEBUG_DICT) {
+ AKLOGI("Split two words: %d, %d, %d, %d", firstFreq, secondFreq, pairFreq, inputLength);
+ }
+ addWord(word, newWordLength, pairFreq, masterQueue);
+ return;
+}
+
// Wrapper for getMostFrequentWordLikeInner, which matches it to the previous
// interface.
inline int UnigramDictionary::getMostFrequentWordLike(const int startInputIndex,
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 5e7a7580f..b950971bb 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -104,6 +104,10 @@ class UnigramDictionary {
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength, const int spaceProximityPos,
const int missingSpacePos, Correction *correction, WordsPriorityQueuePool* queuePool);
+ void getSplitTwoWordsSuggestionsOld(ProximityInfo *proximityInfo,
+ const int *xcoordinates, const int *ycoordinates, const int *codes,
+ const bool useFullEditDistance, const int inputLength, const int missingSpacePos,
+ const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool);
void getMissingSpaceWords(ProximityInfo *proximityInfo, const int *xcoordinates,
const int *ycoordinates, const int *codes, const bool useFullEditDistance,
const int inputLength, const int missingSpacePos, Correction *correction,
diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h
index 54bf27a59..6262439b5 100644
--- a/native/src/words_priority_queue.h
+++ b/native/src/words_priority_queue.h
@@ -81,10 +81,9 @@ class WordsPriorityQueue {
mSuggestions.push(sw);
}
- SuggestedWord* topAndPop() {
+ SuggestedWord* top() {
if (mSuggestions.empty()) return 0;
SuggestedWord* sw = mSuggestions.top();
- mSuggestions.pop();
return sw;
}
@@ -112,7 +111,7 @@ class WordsPriorityQueue {
return size;
}
- int size() {
+ int size() const {
return mSuggestions.size();
}