diff options
Diffstat (limited to 'java/src/com/android/inputmethod/latin/makedict')
-rw-r--r-- | java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java | 40 |
1 files changed, 30 insertions, 10 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java index d82d503c4..3c818cc56 100644 --- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java +++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java @@ -174,6 +174,7 @@ public class BinaryDictInputOutput { private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767 private static final int MAX_TERMINAL_FREQUENCY = 255; + private static final int MAX_BIGRAM_FREQUENCY = 15; // Arbitrary limit to how much passes we consider address size compression should // terminate in. At the time of this writing, our largest dictionary completes @@ -722,15 +723,17 @@ public class BinaryDictInputOutput { } /** - * Makes the flag value for an attribute. + * Makes the flag value for a bigram. * - * @param more whether there are more attributes after this one. - * @param offset the offset of the attribute. - * @param frequency the frequency of the attribute, 0..15 + * @param more whether there are more bigrams after this one. + * @param offset the offset of the bigram. + * @param bigramFrequency the frequency of the bigram, 0..255. + * @param unigramFrequency the unigram frequency of the same word, 0..255. + * @param word the second bigram, for debugging purposes * @return the flags */ - private static final int makeAttributeFlags(final boolean more, final int offset, - final int frequency) { + private static final int makeBigramFlags(final boolean more, final int offset, + int bigramFrequency, final int unigramFrequency, final String word) { int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0); switch (getByteSize(offset)) { @@ -746,7 +749,21 @@ public class BinaryDictInputOutput { default: throw new RuntimeException("Strange offset size"); } - bigramFlags += frequency & FLAG_ATTRIBUTE_FREQUENCY; + if (unigramFrequency > bigramFrequency) { + MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word + + "\". Bigram freq is " + bigramFrequency + ", unigram freq for " + + word + " is " + unigramFrequency); + bigramFrequency = unigramFrequency; + } + // We compute the difference between 255 (which means probability = 1) and the + // unigram score. We split this into discrete 16 steps, and this is the value + // we store into the 4 bits of the bigrams frequency. + final float bigramRatio = (float)(bigramFrequency - unigramFrequency) + / (MAX_TERMINAL_FREQUENCY - unigramFrequency); + // TODO: if the bigram freq is very close to the unigram frequency, we don't want + // to include the bigram in the binary dictionary at all. + final int discretizedFrequency = Math.round(bigramRatio * MAX_BIGRAM_FREQUENCY); + bigramFlags += discretizedFrequency & FLAG_ATTRIBUTE_FREQUENCY; return bigramFlags; } @@ -854,11 +871,14 @@ public class BinaryDictInputOutput { final Iterator bigramIterator = group.mBigrams.iterator(); while (bigramIterator.hasNext()) { final WeightedString bigram = (WeightedString)bigramIterator.next(); - final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); + final CharGroup target = + FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord); + final int addressOfBigram = target.mCachedAddress; + final int unigramFrequencyForThisWord = target.mFrequency; ++groupAddress; final int offset = addressOfBigram - groupAddress; - int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset, - bigram.mFrequency); + int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset, + bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord); buffer[index++] = (byte)bigramFlags; final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); index += bigramShift; |