aboutsummaryrefslogtreecommitdiffstats
path: root/java
diff options
context:
space:
mode:
authorJean Chalard <jchalard@google.com>2012-10-23 12:07:57 +0900
committerJean Chalard <jchalard@google.com>2012-10-23 17:17:37 +0900
commitc59c7419878d91852420bcc6663e7dc6aaf446fd (patch)
tree20ed061375cb911bbaf754557a5dc3ccf1ffea3f /java
parent7f77aed5070981ffa5ccb88b233664962c712f40 (diff)
downloadlatinime-c59c7419878d91852420bcc6663e7dc6aaf446fd.tar.gz
latinime-c59c7419878d91852420bcc6663e7dc6aaf446fd.tar.xz
latinime-c59c7419878d91852420bcc6663e7dc6aaf446fd.zip
Return the correct bigram frequency
The "correct" bigram frequency is now returned by the reading code. However, as the binary format represents the frequency in a lossy manner, the frequency is not guaranteed to be the exact same as the one in the source text format - only a close enough value. It is however the exact same value seen by the native code. Bug: 7395653 Change-Id: I49199ef18901c671189912b3550623e9643baedd
Diffstat (limited to 'java')
-rw-r--r--java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java33
1 files changed, 21 insertions, 12 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
index 277796dd2..da5236974 100644
--- a/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
+++ b/java/src/com/android/inputmethod/latin/makedict/BinaryDictInputOutput.java
@@ -1374,7 +1374,8 @@ public final class BinaryDictInputOutput {
// of this method. Since it performs direct, unbuffered random access to the file and
// may be called hundreds of thousands of times, the resulting performance is not
// reasonable without some kind of cache. Thus:
- private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>();
+ private static TreeMap<Integer, WeightedString> wordCache =
+ new TreeMap<Integer, WeightedString>();
/**
* Finds, as a string, the word at the address passed as an argument.
*
@@ -1382,15 +1383,15 @@ public final class BinaryDictInputOutput {
* @param headerSize the size of the header.
* @param address the address to seek.
* @param formatOptions file format options.
- * @return the word, as a string.
+ * @return the word with its frequency, as a weighted string.
*/
- /* packages for tests */ static String getWordAtAddress(
+ /* package for tests */ static WeightedString getWordAtAddress(
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
final FormatOptions formatOptions) {
- final String cachedString = wordCache.get(address);
+ final WeightedString cachedString = wordCache.get(address);
if (null != cachedString) return cachedString;
- final String result;
+ final WeightedString result;
final int originalPointer = buffer.position();
buffer.position(address);
@@ -1406,14 +1407,17 @@ public final class BinaryDictInputOutput {
return result;
}
+ // TODO: static!? This will behave erratically when used in multi-threaded code.
+ // We need to fix this
private static int[] sGetWordBuffer = new int[FormatSpec.MAX_WORD_LENGTH];
- private static String getWordAtAddressWithParentAddress(
+ private static WeightedString getWordAtAddressWithParentAddress(
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
final FormatOptions options) {
final StringBuilder builder = new StringBuilder();
int currentAddress = address;
int index = FormatSpec.MAX_WORD_LENGTH - 1;
+ int frequency = Integer.MIN_VALUE;
// the length of the path from the root to the leaf is limited by MAX_WORD_LENGTH
for (int count = 0; count < FormatSpec.MAX_WORD_LENGTH; ++count) {
CharGroupInfo currentInfo;
@@ -1428,6 +1432,7 @@ public final class BinaryDictInputOutput {
MakedictLog.d("Too many jumps - probably a bug");
}
} while (isMovedGroup(currentInfo.mFlags, options));
+ if (Integer.MIN_VALUE == frequency) frequency = currentInfo.mFrequency;
for (int i = 0; i < currentInfo.mCharacters.length; ++i) {
sGetWordBuffer[index--] =
currentInfo.mCharacters[currentInfo.mCharacters.length - i - 1];
@@ -1436,17 +1441,19 @@ public final class BinaryDictInputOutput {
currentAddress = currentInfo.mParentAddress + currentInfo.mOriginalAddress;
}
- return new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1);
+ return new WeightedString(
+ new String(sGetWordBuffer, index + 1, FormatSpec.MAX_WORD_LENGTH - index - 1),
+ frequency);
}
- private static String getWordAtAddressWithoutParentAddress(
+ private static WeightedString getWordAtAddressWithoutParentAddress(
final FusionDictionaryBufferInterface buffer, final int headerSize, final int address,
final FormatOptions options) {
buffer.position(headerSize);
final int count = readCharGroupCount(buffer);
int groupOffset = getGroupCountSize(count);
final StringBuilder builder = new StringBuilder();
- String result = null;
+ WeightedString result = null;
CharGroupInfo last = null;
for (int i = count - 1; i >= 0; --i) {
@@ -1454,7 +1461,7 @@ public final class BinaryDictInputOutput {
groupOffset = info.mEndAddress;
if (info.mOriginalAddress == address) {
builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
- result = builder.toString();
+ result = new WeightedString(builder.toString(), info.mFrequency);
break; // and return
}
if (hasChildrenAddress(info.mChildrenAddress)) {
@@ -1515,9 +1522,11 @@ public final class BinaryDictInputOutput {
if (null != info.mBigrams) {
bigrams = new ArrayList<WeightedString>();
for (PendingAttribute bigram : info.mBigrams) {
- final String word = getWordAtAddress(
+ final WeightedString word = getWordAtAddress(
buffer, headerSize, bigram.mAddress, options);
- bigrams.add(new WeightedString(word, bigram.mFrequency));
+ final int reconstructedFrequency =
+ reconstructBigramFrequency(word.mFrequency, bigram.mFrequency);
+ bigrams.add(new WeightedString(word.mWord, reconstructedFrequency));
}
}
if (hasChildrenAddress(info.mChildrenAddress)) {