about | summary | refs | log | tree | commit | diff | stats
path: root/native
diff options
context:
space:
mode:
Diffstat (limited to 'native')
-rw-r--r-- native/src/correction.cpp 104
-rw-r--r-- native/src/correction.h 10
-rw-r--r-- native/src/defines.h 8
-rw-r--r-- native/src/unigram_dictionary.cpp 65
-rw-r--r-- native/src/unigram_dictionary.h 2
5 files changed, 125 insertions, 64 deletions
diff --git a/native/src/correction.cpp b/native/src/correction.cpp
index 5f11452ae..087219ed4 100644
--- a/native/src/correction.cpp
+++ b/native/src/correction.cpp
@@ -159,10 +159,10 @@ void Correction::checkState() {
}
}
-int Correction::getFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
- const bool isSpaceProximity, const unsigned short *word) {
- return Correction::RankingAlgorithm::calcFreqForSplitTwoWords(freqArray, wordLengthArray, this,
- isSpaceProximity, word);
+int Correction::getFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
+ const int wordCount, const bool isSpaceProximity, const unsigned short *word) {
+ return Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(freqArray, wordLengthArray,
+ wordCount, this, isSpaceProximity, word);
}
int Correction::getFinalFreq(const int freq, unsigned short **word, int *wordLength) {
@@ -911,45 +911,85 @@ int Correction::RankingAlgorithm::calculateFinalFreq(const int inputIndex, const
}
/* static */
-int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
- const int *freqArray, const int *wordLengthArray, const Correction* correction,
- const bool isSpaceProximity, const unsigned short *word) {
- const int firstFreq = freqArray[0];
- const int secondFreq = freqArray[1];
- const int firstWordLength = wordLengthArray[0];
- const int secondWordLength = wordLengthArray[1];
+int Correction::RankingAlgorithm::calcFreqForSplitMultipleWords(
+ const int *freqArray, const int *wordLengthArray, const int wordCount,
+ const Correction* correction, const bool isSpaceProximity, const unsigned short *word) {
const int typedLetterMultiplier = correction->TYPED_LETTER_MULTIPLIER;
bool firstCapitalizedWordDemotion = false;
- if (firstWordLength >= 2) {
- firstCapitalizedWordDemotion = isUpperCase(word[0]);
- }
-
bool secondCapitalizedWordDemotion = false;
- if (secondWordLength >= 2) {
- secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+
+ {
+ // TODO: Handle multiple capitalized word demotion properly
+ const int firstWordLength = wordLengthArray[0];
+ const int secondWordLength = wordLengthArray[1];
+ if (firstWordLength >= 2) {
+ firstCapitalizedWordDemotion = isUpperCase(word[0]);
+ }
+
+ if (secondWordLength >= 2) {
+ // FIXME: word[firstWordLength + 1] is incorrect.
+ secondCapitalizedWordDemotion = isUpperCase(word[firstWordLength + 1]);
+ }
}
+
const bool capitalizedWordDemotion =
firstCapitalizedWordDemotion ^ secondCapitalizedWordDemotion;
- if (firstWordLength == 0 || secondWordLength == 0) {
- return 0;
+ int totalLength = 0;
+ int totalFreq = 0;
+ for (int i = 0; i < wordCount; ++i){
+ const int wordLength = wordLengthArray[i];
+ if (wordLength <= 0) {
+ return 0;
+ }
+ totalLength += wordLength;
+ const int demotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (wordLength + 1);
+ int tempFirstFreq = freqArray[i];
+ multiplyRate(demotionRate, &tempFirstFreq);
+ totalFreq += tempFirstFreq;
}
- const int firstDemotionRate = 100 - TWO_WORDS_CORRECTION_DEMOTION_BASE / (firstWordLength + 1);
- int tempFirstFreq = firstFreq;
- multiplyRate(firstDemotionRate, &tempFirstFreq);
-
- const int secondDemotionRate = 100
- - TWO_WORDS_CORRECTION_DEMOTION_BASE / (secondWordLength + 1);
- int tempSecondFreq = secondFreq;
- multiplyRate(secondDemotionRate, &tempSecondFreq);
- const int totalLength = firstWordLength + secondWordLength;
+ if (totalLength <= 0 || totalFreq <= 0) {
+ return 0;
+ }
+ // TODO: Currently totalFreq is adjusted to two word metric.
// Promote pairFreq with multiplying by 2, because the word length is the same as the typed
// length.
- int totalFreq = tempFirstFreq + tempSecondFreq;
+ totalFreq = totalFreq * 2 / wordCount;
+ if (wordCount > 2) {
+ // Safety net for 3+ words -- Caveats: many heuristics and workarounds here.
+ int oneLengthCounter = 0;
+ int twoLengthCounter = 0;
+ for (int i = 0; i < wordCount; ++i) {
+ const int wordLength = wordLengthArray[i];
+ // TODO: Use bigram instead of this safety net
+ if (i < wordCount - 1) {
+ const int nextWordLength = wordLengthArray[i + 1];
+ if (wordLength == 1 && nextWordLength == 2) {
+ // Safety net to filter 1 length and 2 length sequential words
+ return 0;
+ }
+ }
+ const int freq = freqArray[i];
+ // Demote too short weak words
+ if (wordLength <= 4 && freq <= MAX_FREQ * 2 / 3 /* heuristic... */) {
+ multiplyRate(100 * freq / MAX_FREQ, &totalFreq);
+ }
+ if (wordLength == 1) {
+ ++oneLengthCounter;
+ } else if (wordLength == 2) {
+ ++twoLengthCounter;
+ }
+ if (oneLengthCounter >= 2 || (oneLengthCounter + twoLengthCounter) >= 4) {
+ // Safety net to filter too many short words
+ return 0;
+ }
+ }
+ multiplyRate(MULTIPLE_WORDS_DEMOTION_RATE, &totalFreq);
+ }
// This is a workaround to try offsetting the not-enough-demotion which will be done in
// calcNormalizedScore in Utils.java.
@@ -993,9 +1033,9 @@ int Correction::RankingAlgorithm::calcFreqForSplitTwoWords(
}
if (DEBUG_CORRECTION_FREQ) {
- AKLOGI("Two words (%d, %d) (%d, %d) %d, %d", firstFreq, secondFreq, firstWordLength,
- secondWordLength, capitalizedWordDemotion, totalFreq);
- DUMP_WORD(word, firstWordLength);
+ AKLOGI("Multiple words (%d, %d) (%d, %d) %d, %d", freqArray[0], freqArray[1],
+ wordLengthArray[0], wordLengthArray[1], capitalizedWordDemotion, totalFreq);
+ DUMP_WORD(word, wordLengthArray[0]);
}
return totalFreq;
diff --git a/native/src/correction.h b/native/src/correction.h
index 9559bbf12..2114eff4b 100644
--- a/native/src/correction.h
+++ b/native/src/correction.h
@@ -121,9 +121,9 @@ class Correction {
bool needsToPrune() const;
- int getFreqForSplitTwoWords(
- const int *freqArray, const int *wordLengthArray, const bool isSpaceProximity,
- const unsigned short *word);
+ int getFreqForSplitMultipleWords(
+ const int *freqArray, const int *wordLengthArray, const int wordCount,
+ const bool isSpaceProximity, const unsigned short *word);
int getFinalFreq(const int freq, unsigned short **word, int* wordLength);
int getFinalFreqForSubQueue(const int freq, unsigned short **word, int* wordLength,
const int inputLength);
@@ -151,8 +151,8 @@ class Correction {
static int calculateFinalFreq(const int inputIndex, const int depth,
const int freq, int *editDistanceTable, const Correction* correction,
const int inputLength);
- static int calcFreqForSplitTwoWords(const int *freqArray, const int *wordLengthArray,
- const Correction* correction, const bool isSpaceProximity,
+ static int calcFreqForSplitMultipleWords(const int *freqArray, const int *wordLengthArray,
+ const int wordCount, const Correction* correction, const bool isSpaceProximity,
const unsigned short *word);
static double calcNormalizedScore(const unsigned short* before, const int beforeLength,
const unsigned short* after, const int afterLength, const int score);
diff --git a/native/src/defines.h b/native/src/defines.h
index 5b5c54850..ffadb11c5 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -208,6 +208,7 @@ static void prof_out(void) {
#define ZERO_DISTANCE_PROMOTION_RATE 110
#define NEUTRAL_SCORE_SQUARED_RADIUS 8.0f
#define HALF_SCORE_SQUARED_RADIUS 32.0f
+#define MAX_FREQ 255
// This must be greater than or equal to MAX_WORD_LENGTH defined in BinaryDictionary.java
// This is only used for the size of array. Not to be used in c functions.
@@ -222,7 +223,9 @@ static void prof_out(void) {
#define SUB_QUEUE_MAX_WORDS 1
#define SUB_QUEUE_MAX_COUNT 10
#define SUB_QUEUE_MIN_WORD_LENGTH 4
-#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 2
+#define MULTIPLE_WORDS_SUGGESTION_MAX_WORDS 10
+#define MULTIPLE_WORDS_DEMOTION_RATE 80
+#define MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION 6
#define TWO_WORDS_CORRECTION_WITH_OTHER_ERROR_THRESHOLD 0.39
#define START_TWO_WORDS_CORRECTION_THRESHOLD 0.22
@@ -230,7 +233,6 @@ static void prof_out(void) {
#define MAX_DEPTH_MULTIPLIER 3
#define FIRST_WORD_INDEX 0
-#define SECOND_WORD_INDEX 1
// TODO: Reduce this constant if possible; check the maximum number of umlauts in the same German
// word in the dictionary
@@ -248,7 +250,7 @@ template<typename T> inline T max(T a, T b) { return a > b ? a : b; }
#define NEUTRAL_AREA_RADIUS_RATIO 1.3f
// DEBUG
-#define INPUTLENGTH_FOR_DEBUG 10
+#define INPUTLENGTH_FOR_DEBUG -1
#define MIN_OUTPUT_INDEX_FOR_DEBUG -1
#endif // LATINIME_DEFINES_H
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index 597e5c821..155bdcb7a 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -224,7 +224,7 @@ void UnigramDictionary::getWordSuggestions(ProximityInfo *proximityInfo,
// Multiple word suggestions
if (SUGGEST_MULTIPLE_WORDS
&& inputLength >= MIN_USER_TYPED_LENGTH_FOR_MULTIPLE_WORD_SUGGESTION) {
- getSplitTwoWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
+ getSplitMultipleWordsSuggestions(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, inputLength, correction, queuePool,
hasAutoCorrectionCandidate);
}
@@ -445,17 +445,18 @@ bool UnigramDictionary::getSubStringSuggestion(
if (outputWordStartPos + nextWordLength >= MAX_WORD_LENGTH) {
return false;
}
- outputWord[outputWordStartPos + tempOutputWordLength] = SPACE;
+ outputWord[tempOutputWordLength] = SPACE;
if (outputWordLength) {
++*outputWordLength;
}
} else if (currentWordIndex >= 1) {
// TODO: Handle 3 or more words
- const int pairFreq = correction->getFreqForSplitTwoWords(
- freqArray, wordLengthArray, isSpaceProximity, outputWord);
+ const int pairFreq = correction->getFreqForSplitMultipleWords(
+ freqArray, wordLengthArray, currentWordIndex + 1, isSpaceProximity, outputWord);
if (DEBUG_DICT) {
- AKLOGI("Split two words: %d, %d, %d, %d, (%d)", freqArray[0], freqArray[1], pairFreq,
- inputLength, wordLengthArray[0]);
+ DUMP_WORD(outputWord, tempOutputWordLength);
+ AKLOGI("Split two words: %d, %d, %d, %d, (%d) %d", freqArray[0], freqArray[1], pairFreq,
+ inputLength, wordLengthArray[0], tempOutputWordLength);
}
addWord(outputWord, tempOutputWordLength, pairFreq, queuePool->getMasterQueue());
}
@@ -473,30 +474,46 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
// Return if the last word index
return;
}
- for (int i = 1; i < inputLength; ++i) {
- int tempOutputWordLength = 0;
- // First word
- int inputWordStartPos = 0;
- int inputWordLength = i;
+ if (startWordIndex >= 1
+ && (hasAutoCorrectionCandidate
+ || inputLength < MIN_INPUT_LENGTH_FOR_THREE_OR_MORE_WORDS_CORRECTION)) {
+ // Do not suggest 3+ words if there is already an auto correction candidate or the input is too short
+ return;
+ }
+ for (int i = startInputPos + 1; i < inputLength; ++i) {
if (DEBUG_CORRECTION_FREQ) {
- AKLOGI("Two words, %d", inputWordLength);
+ AKLOGI("Multi words(%d), start in %d sep %d start out %d",
+ startWordIndex, startInputPos, i, outputWordLength);
+ DUMP_WORD(outputWord, outputWordLength);
}
+ int tempOutputWordLength = 0;
+ // Current word
+ int inputWordStartPos = startInputPos;
+ int inputWordLength = i - startInputPos;
if (!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
- FIRST_WORD_INDEX, inputWordStartPos, inputWordLength, 0, true /* not used */,
- freqArray, wordLengthArray, outputWord, &tempOutputWordLength)) {
+ startWordIndex, inputWordStartPos, inputWordLength, outputWordLength,
+ true /* not used */, freqArray, wordLengthArray, outputWord,
+ &tempOutputWordLength)) {
continue;
}
- // Second word
+ if (DEBUG_CORRECTION_FREQ) {
+ AKLOGI("Do missing space correction");
+ }
+ // Next word
// Missing space
inputWordStartPos = i;
inputWordLength = inputLength - i;
- getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
+ if(!getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
- SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
- false /* missing space */, freqArray, wordLengthArray, outputWord,
- 0);
+ startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
+ false /* missing space */, freqArray, wordLengthArray, outputWord, 0)) {
+ getMultiWordsSuggestionRec(proximityInfo, xcoordinates, ycoordinates, codes,
+ useFullEditDistance, inputLength, correction, queuePool,
+ hasAutoCorrectionCandidate, inputWordStartPos, startWordIndex + 1,
+ tempOutputWordLength, freqArray, wordLengthArray, outputWord);
+ }
// Mistyped space
++inputWordStartPos;
@@ -512,15 +529,17 @@ void UnigramDictionary::getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
continue;
}
+ if (DEBUG_CORRECTION_FREQ) {
+ AKLOGI("Do mistyped space correction");
+ }
getSubStringSuggestion(proximityInfo, xcoordinates, ycoordinates, codes,
useFullEditDistance, correction, queuePool, inputLength, hasAutoCorrectionCandidate,
- SECOND_WORD_INDEX, inputWordStartPos, inputWordLength, tempOutputWordLength,
- true /* mistyped space */, freqArray, wordLengthArray, outputWord,
- 0);
+ startWordIndex + 1, inputWordStartPos, inputWordLength, tempOutputWordLength,
+ true /* mistyped space */, freqArray, wordLengthArray, outputWord, 0);
}
}
-void UnigramDictionary::getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
+void UnigramDictionary::getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength,
Correction *correction, WordsPriorityQueuePool* queuePool,
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 2d5d076b1..396a81149 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -101,7 +101,7 @@ class UnigramDictionary {
const bool useFullEditDistance, const int inputLength, Correction *correction,
WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
const int currentWordIndex);
- void getSplitTwoWordsSuggestions(ProximityInfo *proximityInfo,
+ void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
const int *xcoordinates, const int *ycoordinates, const int *codes,
const bool useFullEditDistance, const int inputLength,
Correction *correction, WordsPriorityQueuePool* queuePool,