about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Keisuke Kuroyanagi <ksk@google.com> 2014-10-22 18:15:53 +0900
committer Keisuke Kuroyanagi <ksk@google.com> 2014-10-22 18:15:53 +0900
commit b5ef884fbb6bfd08ce793604cdf7f0941e958a84 (patch)
tree 4105ac7240773a2421a5ee44128a5fe2862ad430
parent 1249395563d43c818e12038231ec89dcbcdc5cd0 (diff)
download latinime-b5ef884fbb6bfd08ce793604cdf7f0941e958a84.tar.gz
latinime-b5ef884fbb6bfd08ce793604cdf7f0941e958a84.tar.xz
latinime-b5ef884fbb6bfd08ce793604cdf7f0941e958a84.zip
Support dumping ngram entries.
Bug: 14425059
Change-Id: Ib03a0c3d166ed6f1e60c67127b28006d55143b6b
-rw-r--r-- java/src/com/android/inputmethod/latin/makedict/WordProperty.java 19
-rw-r--r-- java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java 18
-rw-r--r-- native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp 5
-rw-r--r-- native/jni/src/suggest/core/dictionary/property/word_property.cpp 48
-rw-r--r-- native/jni/src/suggest/core/dictionary/property/word_property.h 6
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp 4
-rw-r--r-- native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp 12
7 files changed, 77 insertions, 35 deletions
diff --git a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java
index b129c3e40..e7808e46e 100644
--- a/java/src/com/android/inputmethod/latin/makedict/WordProperty.java
+++ b/java/src/com/android/inputmethod/latin/makedict/WordProperty.java
@@ -87,7 +87,7 @@ public final class WordProperty implements Comparable<WordProperty> {
final boolean isPossiblyOffensive, final boolean hasBigram, final boolean hasShortcuts,
final boolean isBeginningOfSentence, final int[] probabilityInfo,
final ArrayList<int[][]> ngramPrevWordsArray,
- final ArrayList<boolean[]> outNgramPrevWordIsBeginningOfSentenceArray,
+ final ArrayList<boolean[]> ngramPrevWordIsBeginningOfSentenceArray,
final ArrayList<int[]> ngramTargets, final ArrayList<int[]> ngramProbabilityInfo,
final ArrayList<int[]> shortcutTargets,
final ArrayList<Integer> shortcutProbabilities) {
@@ -102,16 +102,22 @@ public final class WordProperty implements Comparable<WordProperty> {
mHasNgrams = hasBigram;
final int relatedNgramCount = ngramTargets.size();
- final WordInfo currentWordInfo =
- mIsBeginningOfSentence ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
- : new WordInfo(mWord);
- final NgramContext ngramContext = new NgramContext(currentWordInfo);
for (int i = 0; i < relatedNgramCount; i++) {
final String ngramTargetString =
StringUtils.getStringFromNullTerminatedCodePointArray(ngramTargets.get(i));
final WeightedString ngramTarget = new WeightedString(ngramTargetString,
createProbabilityInfoFromArray(ngramProbabilityInfo.get(i)));
- // TODO: Support n-gram.
+ final int[][] prevWords = ngramPrevWordsArray.get(i);
+ final boolean[] isBeginningOfSentenceArray =
+ ngramPrevWordIsBeginningOfSentenceArray.get(i);
+ final WordInfo[] wordInfoArray = new WordInfo[prevWords.length];
+ for (int j = 0; j < prevWords.length; j++) {
+ wordInfoArray[j] = isBeginningOfSentenceArray[j]
+ ? WordInfo.BEGINNING_OF_SENTENCE_WORD_INFO
+ : new WordInfo(StringUtils.getStringFromNullTerminatedCodePointArray(
+ prevWords[j]));
+ }
+ final NgramContext ngramContext = new NgramContext(wordInfoArray);
ngrams.add(new NgramProperty(ngramTarget, ngramContext));
}
mNgrams = ngrams.isEmpty() ? null : ngrams;
@@ -126,6 +132,7 @@ public final class WordProperty implements Comparable<WordProperty> {
}
// TODO: Remove
+ @UsedForTesting
public ArrayList<WeightedString> getBigrams() {
if (null == mNgrams) {
return null;
diff --git a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java
index 248246232..4e0f5f583 100644
--- a/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java
+++ b/java/src/com/android/inputmethod/latin/utils/CombinedFormatUtils.java
@@ -17,6 +17,7 @@
package com.android.inputmethod.latin.utils;
import com.android.inputmethod.latin.makedict.DictionaryHeader;
+import com.android.inputmethod.latin.makedict.NgramProperty;
import com.android.inputmethod.latin.makedict.ProbabilityInfo;
import com.android.inputmethod.latin.makedict.WeightedString;
import com.android.inputmethod.latin.makedict.WordProperty;
@@ -26,6 +27,8 @@ import java.util.HashMap;
public class CombinedFormatUtils {
public static final String DICTIONARY_TAG = "dictionary";
public static final String BIGRAM_TAG = "bigram";
+ public static final String NGRAM_TAG = "ngram";
+ public static final String NGRAM_PREV_WORD_TAG = "prev_word";
public static final String SHORTCUT_TAG = "shortcut";
public static final String PROBABILITY_TAG = "f";
public static final String HISTORICAL_INFO_TAG = "historicalInfo";
@@ -76,12 +79,19 @@ public class CombinedFormatUtils {
}
}
if (wordProperty.mHasNgrams) {
- // TODO: Support ngram.
- for (final WeightedString bigram : wordProperty.getBigrams()) {
- builder.append(" " + BIGRAM_TAG + "=" + bigram.mWord);
+ for (final NgramProperty ngramProperty : wordProperty.mNgrams) {
+ builder.append(" " + NGRAM_TAG + "=" + ngramProperty.mTargetWord.mWord);
builder.append(",");
- builder.append(formatProbabilityInfo(bigram.mProbabilityInfo));
+ builder.append(formatProbabilityInfo(ngramProperty.mTargetWord.mProbabilityInfo));
builder.append("\n");
+ for (int i = 0; i < ngramProperty.mNgramContext.getPrevWordCount(); i++) {
+ builder.append(" " + NGRAM_PREV_WORD_TAG + "[" + i + "]="
+ + ngramProperty.mNgramContext.getNthPrevWord(i + 1));
+ if (ngramProperty.mNgramContext.isNthPrevWordBeginningOfSontence(i + 1)) {
+ builder.append("," + BEGINNING_OF_SENTENCE_TAG + "=true");
+ }
+ builder.append("\n");
+ }
}
}
return builder.toString();
diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
index 461d1d859..9239c8400 100644
--- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
+++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp
@@ -327,8 +327,8 @@ static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz,
static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
jlong dict, jintArray word, jboolean isBeginningOfSentence, jintArray outCodePoints,
- jbooleanArray outFlags, jintArray outProbabilityInfo, jobject /* outNgramPrevWordsArray */,
- jobject /* outNgramPrevWordIsBeginningOfSentenceArray */, jobject outNgramTargets,
+ jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
+ jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
jobject outNgramProbabilityInfo, jobject outShortcutTargets,
jobject outShortcutProbabilities) {
Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict);
@@ -352,6 +352,7 @@ static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz,
const WordProperty wordProperty = dictionary->getWordProperty(
CodePointArrayView(wordCodePoints, codePointCount));
wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo,
+ outNgramPrevWordsArray, outNgramPrevWordIsBeginningOfSentenceArray,
outNgramTargets, outNgramProbabilityInfo, outShortcutTargets,
outShortcutProbabilities);
}
diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.cpp b/native/jni/src/suggest/core/dictionary/property/word_property.cpp
index a707f1ba2..019f0880f 100644
--- a/native/jni/src/suggest/core/dictionary/property/word_property.cpp
+++ b/native/jni/src/suggest/core/dictionary/property/word_property.cpp
@@ -22,8 +22,9 @@
namespace latinime {
void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
- jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets,
- jobject outBigramProbabilities, jobject outShortcutTargets,
+ jbooleanArray outFlags, jintArray outProbabilityInfo,
+ jobject outNgramPrevWordsArray, jobject outNgramPrevWordIsBeginningOfSentenceArray,
+ jobject outNgramTargets, jobject outNgramProbabilities, jobject outShortcutTargets,
jobject outShortcutProbabilities) const {
JniDataUtils::outputCodePoints(env, outCodePoints, 0 /* start */,
MAX_WORD_LENGTH /* maxLength */, mCodePoints.data(), mCodePoints.size(),
@@ -43,16 +44,39 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jclass arrayListClass = env->FindClass("java/util/ArrayList");
jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z");
- // Output bigrams.
- // TODO: Support n-gram
+ // Output ngrams.
+ jclass intArrayClass = env->FindClass("[I");
for (const auto &ngramProperty : mNgrams) {
- const std::vector<int> *const word1CodePoints = ngramProperty.getTargetCodePoints();
- jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size());
- JniDataUtils::outputCodePoints(env, bigramWord1CodePointArray, 0 /* start */,
- word1CodePoints->size(), word1CodePoints->data(), word1CodePoints->size(),
- false /* needsNullTermination */);
- env->CallBooleanMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray);
- env->DeleteLocalRef(bigramWord1CodePointArray);
+ const NgramContext *const ngramContext = ngramProperty.getNgramContext();
+ jobjectArray prevWordWordCodePointsArray = env->NewObjectArray(
+ ngramContext->getPrevWordCount(), intArrayClass, nullptr);
+ jbooleanArray prevWordIsBeginningOfSentenceArray =
+ env->NewBooleanArray(ngramContext->getPrevWordCount());
+ for (size_t i = 0; i < ngramContext->getPrevWordCount(); ++i) {
+ const CodePointArrayView codePoints = ngramContext->getNthPrevWordCodePoints(i + 1);
+ jintArray prevWordCodePoints = env->NewIntArray(codePoints.size());
+ JniDataUtils::outputCodePoints(env, prevWordCodePoints, 0 /* start */,
+ codePoints.size(), codePoints.data(), codePoints.size(),
+ false /* needsNullTermination */);
+ env->SetObjectArrayElement(prevWordWordCodePointsArray, i, prevWordCodePoints);
+ env->DeleteLocalRef(prevWordCodePoints);
+ JniDataUtils::putBooleanToArray(env, prevWordIsBeginningOfSentenceArray, i,
+ ngramContext->isNthPrevWordBeginningOfSentence(i + 1));
+ }
+ env->CallBooleanMethod(outNgramPrevWordsArray, addMethodId, prevWordWordCodePointsArray);
+ env->CallBooleanMethod(outNgramPrevWordIsBeginningOfSentenceArray, addMethodId,
+ prevWordIsBeginningOfSentenceArray);
+ env->DeleteLocalRef(prevWordWordCodePointsArray);
+ env->DeleteLocalRef(prevWordIsBeginningOfSentenceArray);
+
+ const std::vector<int> *const targetWordCodePoints = ngramProperty.getTargetCodePoints();
+ jintArray targetWordCodePointArray = env->NewIntArray(targetWordCodePoints->size());
+ JniDataUtils::outputCodePoints(env, targetWordCodePointArray, 0 /* start */,
+ targetWordCodePoints->size(), targetWordCodePoints->data(),
+ targetWordCodePoints->size(), false /* needsNullTermination */);
+ env->CallBooleanMethod(outNgramTargets, addMethodId, targetWordCodePointArray);
+ env->DeleteLocalRef(targetWordCodePointArray);
+
const HistoricalInfo &ngramHistoricalInfo = ngramProperty.getHistoricalInfo();
int bigramProbabilityInfo[] = {ngramProperty.getProbability(),
ngramHistoricalInfo.getTimestamp(), ngramHistoricalInfo.getLevel(),
@@ -60,7 +84,7 @@ void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints,
jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo));
env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */,
NELEMS(bigramProbabilityInfo), bigramProbabilityInfo);
- env->CallBooleanMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray);
+ env->CallBooleanMethod(outNgramProbabilities, addMethodId, bigramProbabilityInfoArray);
env->DeleteLocalRef(bigramProbabilityInfoArray);
}
diff --git a/native/jni/src/suggest/core/dictionary/property/word_property.h b/native/jni/src/suggest/core/dictionary/property/word_property.h
index 01b8987b5..b5314faaa 100644
--- a/native/jni/src/suggest/core/dictionary/property/word_property.h
+++ b/native/jni/src/suggest/core/dictionary/property/word_property.h
@@ -39,8 +39,10 @@ class WordProperty {
mNgrams(*ngrams) {}
void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags,
- jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities,
- jobject outShortcutTargets, jobject outShortcutProbabilities) const;
+ jintArray outProbabilityInfo, jobject outNgramPrevWordsArray,
+ jobject outNgramPrevWordIsBeginningOfSentenceArray, jobject outNgramTargets,
+ jobject outNgramProbabilities, jobject outShortcutTargets,
+ jobject outShortcutProbabilities) const;
const UnigramProperty *getUnigramProperty() const {
return &mUnigramProperty;
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
index b96290437..509bd683b 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/content/language_model_dict_content.cpp
@@ -90,8 +90,8 @@ const WordAttributes LanguageModelDictContent::getWordAttributes(const WordIdArr
// TODO: Some flags in unigramProbabilityEntry should be overwritten by flags in
// probabilityEntry.
const ProbabilityEntry unigramProbabilityEntry = getProbabilityEntry(wordId);
- return WordAttributes(probability, unigramProbabilityEntry.isNotAWord(),
- unigramProbabilityEntry.isBlacklisted(),
+ return WordAttributes(probability, unigramProbabilityEntry.isBlacklisted(),
+ unigramProbabilityEntry.isNotAWord(),
unigramProbabilityEntry.isPossiblyOffensive());
}
// Cannot find the word.
diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
index 193326d82..249d822b2 100644
--- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
+++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp
@@ -488,9 +488,6 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
AKLOGE("getWordProperty is called for invalid word.");
return WordProperty();
}
- const int ptNodePos =
- mBuffers->getTerminalPositionLookupTable()->getTerminalPtNodePosition(wordId);
- const PtNodeParams ptNodeParams = mNodeReader.fetchPtNodeParamsInBufferFromPtNodePos(ptNodePos);
const LanguageModelDictContent *const languageModelDictContent =
mBuffers->getLanguageModelDictContent();
// Fetch ngram information.
@@ -541,12 +538,13 @@ const WordProperty Ver4PatriciaTriePolicy::getWordProperty(
shortcutProbability);
}
}
- const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(
- ptNodeParams.getTerminalId());
+ const WordAttributes wordAttributes = languageModelDictContent->getWordAttributes(
+ WordIdArrayView(), wordId, mHeaderPolicy);
+ const ProbabilityEntry probabilityEntry = languageModelDictContent->getProbabilityEntry(wordId);
const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo();
const UnigramProperty unigramProperty(probabilityEntry.representsBeginningOfSentence(),
- probabilityEntry.isNotAWord(), probabilityEntry.isBlacklisted(),
- probabilityEntry.isPossiblyOffensive(), probabilityEntry.getProbability(),
+ wordAttributes.isNotAWord(), wordAttributes.isBlacklisted(),
+ wordAttributes.isPossiblyOffensive(), wordAttributes.getProbability(),
*historicalInfo, std::move(shortcuts));
return WordProperty(wordCodePoints.toVector(), &unigramProperty, &ngrams);
}