diff options
Diffstat (limited to 'native/jni/src')
-rw-r--r-- | native/jni/src/bigram_dictionary.cpp | 14 | ||||
-rw-r--r-- | native/jni/src/binary_format.h | 27 | ||||
-rw-r--r-- | native/jni/src/dictionary.cpp | 4 | ||||
-rw-r--r-- | native/jni/src/dictionary.h | 2 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.cpp | 17 | ||||
-rw-r--r-- | native/jni/src/unigram_dictionary.h | 2 |
6 files changed, 47 insertions, 19 deletions
diff --git a/native/jni/src/bigram_dictionary.cpp b/native/jni/src/bigram_dictionary.cpp index ac2a26172..eb4bf8d1a 100644 --- a/native/jni/src/bigram_dictionary.cpp +++ b/native/jni/src/bigram_dictionary.cpp @@ -117,14 +117,22 @@ int BigramDictionary::getBigrams(const int32_t *prevWord, int prevWordLength, in do { bigramFlags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); uint16_t bigramBuffer[MAX_WORD_LENGTH]; + int unigramFreq; const int bigramPos = BinaryFormat::getAttributeAddressAndForwardPointer(root, bigramFlags, &pos); const int length = BinaryFormat::getWordAtAddress(root, bigramPos, MAX_WORD_LENGTH, - bigramBuffer); + bigramBuffer, &unigramFreq); // codesSize == 0 means we are trying to find bigram predictions. if (codesSize < 1 || checkFirstCharacter(bigramBuffer)) { - const int frequency = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + const int bigramFreq = UnigramDictionary::MASK_ATTRIBUTE_FREQUENCY & bigramFlags; + // Due to space constraints, the frequency for bigrams is approximate - the lower the + // unigram frequency, the worse the precision. The theoretical maximum error in + // resulting frequency is 8 - although in practice it's never bigger than 3 or 4 + // in very bad cases. This means that sometimes, we'll see some bigrams inverted + // here, but it can't get too bad. 
+ const int frequency = + BinaryFormat::computeFrequencyForBigram(unigramFreq, bigramFreq); if (addWordBigram(bigramBuffer, length, frequency)) { ++bigramCount; } @@ -149,8 +157,8 @@ int BigramDictionary::getBigramListPositionForWord(const int32_t *prevWord, } else { pos = BinaryFormat::skipOtherCharacters(root, pos); } - pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipFrequency(flags, pos); + pos = BinaryFormat::skipChildrenPosition(flags, pos); pos = BinaryFormat::skipShortcuts(root, flags, pos); return pos; } diff --git a/native/jni/src/binary_format.h b/native/jni/src/binary_format.h index 40f197619..51bf8ebbc 100644 --- a/native/jni/src/binary_format.h +++ b/native/jni/src/binary_format.h @@ -66,7 +66,8 @@ class BinaryFormat { static int getTerminalPosition(const uint8_t* const root, const int32_t* const inWord, const int length); static int getWordAtAddress(const uint8_t* const root, const int address, const int maxDepth, - uint16_t* outWord); + uint16_t* outWord, int* outUnigramFrequency); + static int computeFrequencyForBigram(const int unigramFreq, const int bigramFreq); static int getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq); @@ -390,10 +391,11 @@ inline int BinaryFormat::getTerminalPosition(const uint8_t* const root, * address: the byte position of the last chargroup of the word we are searching for (this is * what is stored as the "bigram address" in each bigram) * outword: an array to write the found word, with MAX_WORD_LENGTH size. + * outUnigramFrequency: a pointer to an int to write the frequency into. * Return value : the length of the word, of 0 if the word was not found. 
*/ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int address, - const int maxDepth, uint16_t* outWord) { + const int maxDepth, uint16_t* outWord, int* outUnigramFrequency) { int pos = 0; int wordPos = 0; @@ -421,11 +423,12 @@ inline int BinaryFormat::getWordAtAddress(const uint8_t* const root, const int a // We count chars in order to avoid infinite loops if the file is broken or // if there is some other bug int charCount = maxDepth; - while (-1 != nextChar && --charCount > 0) { + while (NOT_A_CHARACTER != nextChar && --charCount > 0) { outWord[++wordPos] = nextChar; nextChar = getCharCodeAndForwardPointer(root, &pos); } } + *outUnigramFrequency = readFrequencyWithoutMovingPointer(root, pos); return ++wordPos; } // We need to skip past this char group, so skip any remaining chars after the @@ -529,6 +532,16 @@ static inline int backoff(const int unigramFreq) { // return unigramFreq > 8 ? unigramFreq - 8 : (0 == unigramFreq ? 0 : 8); } +inline int BinaryFormat::computeFrequencyForBigram(const int unigramFreq, const int bigramFreq) { + // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the + // unigram frequency to be the median value of the 17th step from the top. A value of + // 0 for the bigram frequency represents the middle of the 16th step from the top, + // while a value of 15 represents the middle of the top step. + // See makedict.BinaryDictInputOutput for details. + const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); + return (int)(unigramFreq + (bigramFreq + 1) * stepSize); +} + // This returns a probability in log space. 
inline int BinaryFormat::getProbability(const int position, const std::map<int, int> *bigramMap, const uint8_t *bigramFilter, const int unigramFreq) { @@ -537,13 +550,7 @@ inline int BinaryFormat::getProbability(const int position, const std::map<int, const std::map<int, int>::const_iterator bigramFreqIt = bigramMap->find(position); if (bigramFreqIt != bigramMap->end()) { const int bigramFreq = bigramFreqIt->second; - // We divide the range [unigramFreq..255] in 16.5 steps - in other words, we want the - // unigram frequency to be the median value of the 17th step from the top. A value of - // 0 for the bigram frequency represents the middle of the 16th step from the top, - // while a value of 15 represents the middle of the top step. - // See makedict.BinaryDictInputOutput for details. - const float stepSize = ((float)MAX_FREQ - unigramFreq) / (1.5f + MAX_BIGRAM_FREQ); - return (int)(unigramFreq + bigramFreq * stepSize); + return computeFrequencyForBigram(unigramFreq, bigramFreq); } else { return backoff(unigramFreq); } diff --git a/native/jni/src/dictionary.cpp b/native/jni/src/dictionary.cpp index 65d0f73a3..1fb02478b 100644 --- a/native/jni/src/dictionary.cpp +++ b/native/jni/src/dictionary.cpp @@ -55,8 +55,8 @@ Dictionary::~Dictionary() { delete mBigramDictionary; } -bool Dictionary::isValidWord(const int32_t *word, int length) { - return mUnigramDictionary->isValidWord(word, length); +int Dictionary::getFrequency(const int32_t *word, int length) { + return mUnigramDictionary->getFrequency(word, length); } bool Dictionary::isValidBigram(const int32_t *word1, int length1, const int32_t *word2, diff --git a/native/jni/src/dictionary.h b/native/jni/src/dictionary.h index 87891ee4d..9f2367904 100644 --- a/native/jni/src/dictionary.h +++ b/native/jni/src/dictionary.h @@ -52,7 +52,7 @@ class Dictionary { maxWordLength, maxBigrams); } - bool isValidWord(const int32_t *word, int length); + int getFrequency(const int32_t *word, int length); bool isValidBigram(const 
int32_t *word1, int length1, const int32_t *word2, int length2); void *getDict() { return (void *)mDict; } int getDictSize() { return mDictSize; } diff --git a/native/jni/src/unigram_dictionary.cpp b/native/jni/src/unigram_dictionary.cpp index 828582848..efe9c4fe3 100644 --- a/native/jni/src/unigram_dictionary.cpp +++ b/native/jni/src/unigram_dictionary.cpp @@ -747,8 +747,21 @@ int UnigramDictionary::getMostFrequentWordLikeInner(const uint16_t * const inWor return maxFreq; } -bool UnigramDictionary::isValidWord(const int32_t* const inWord, const int length) const { - return NOT_VALID_WORD != BinaryFormat::getTerminalPosition(DICT_ROOT, inWord, length); +int UnigramDictionary::getFrequency(const int32_t* const inWord, const int length) const { + const uint8_t* const root = DICT_ROOT; + int pos = BinaryFormat::getTerminalPosition(root, inWord, length); + if (NOT_VALID_WORD == pos) { + return NOT_A_PROBABILITY; + } + const uint8_t flags = BinaryFormat::getFlagsAndForwardPointer(root, &pos); + const bool hasMultipleChars = (0 != (FLAG_HAS_MULTIPLE_CHARS & flags)); + if (hasMultipleChars) { + pos = BinaryFormat::skipOtherCharacters(root, pos); + } else { + BinaryFormat::getCharCodeAndForwardPointer(DICT_ROOT, &pos); + } + const int unigramFreq = BinaryFormat::readFrequencyWithoutMovingPointer(root, pos); + return unigramFreq; } // TODO: remove this function. 
diff --git a/native/jni/src/unigram_dictionary.h b/native/jni/src/unigram_dictionary.h index b9233518f..b70894004 100644 --- a/native/jni/src/unigram_dictionary.h +++ b/native/jni/src/unigram_dictionary.h @@ -72,7 +72,7 @@ class UnigramDictionary { UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, const unsigned int flags); - bool isValidWord(const int32_t* const inWord, const int length) const; + int getFrequency(const int32_t* const inWord, const int length) const; int getBigramPosition(int pos, unsigned short *word, int offset, int length) const; int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool, Correction *correction, const int *xcoordinates, const int *ycoordinates, |