diff options
Diffstat (limited to 'native')
16 files changed, 354 insertions, 209 deletions
diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk index d2f22598e..c7061c8c3 100644 --- a/native/jni/NativeFileList.mk +++ b/native/jni/NativeFileList.mk @@ -32,7 +32,7 @@ LATIN_IME_CORE_SRC_FILES := \ error_type_utils.cpp \ multi_bigram_map.cpp \ suggestions_output_utils.cpp \ - unigram_property.cpp) \ + word_property.cpp) \ $(addprefix suggest/core/layout/, \ additional_proximity_chars.cpp \ proximity_info.cpp \ diff --git a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp index 586a306ec..c919ebd91 100644 --- a/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp +++ b/native/jni/com_android_inputmethod_latin_BinaryDictionary.cpp @@ -24,7 +24,7 @@ #include "jni.h" #include "jni_common.h" #include "suggest/core/dictionary/dictionary.h" -#include "suggest/core/dictionary/unigram_property.h" +#include "suggest/core/dictionary/word_property.h" #include "suggest/core/suggest_options.h" #include "suggest/policyimpl/dictionary/structure/dictionary_structure_with_buffer_policy_factory.h" #include "suggest/policyimpl/dictionary/utils/dict_file_writing_utils.h" @@ -260,19 +260,39 @@ static jint latinime_BinaryDictionary_getBigramProbability(JNIEnv *env, jclass c word1Length); } -static void latinime_BinaryDictionary_getUnigramProperty(JNIEnv *env, jclass clazz, +// Method to iterate all words in the dictionary for makedict. +// If token is 0, this method newly starts iterating the dictionary. This method returns 0 when +// the dictionary does not have a next word. +static jint latinime_BinaryDictionary_getNextWord(JNIEnv *env, jclass clazz, + jlong dict, jint token, jintArray outCodePoints) { + Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); + if (!dictionary) return 0; + const jsize outCodePointsLength = env->GetArrayLength(outCodePoints); + if (outCodePointsLength != MAX_WORD_LENGTH) { + AKLOGE("Invalid outCodePointsLength: %d", outCodePointsLength); + ASSERT(false); + return 0; + } + int wordCodePoints[outCodePointsLength]; + memset(wordCodePoints, 0, sizeof(wordCodePoints)); + const int nextToken = dictionary->getNextWordAndNextToken(token, wordCodePoints); + env->SetIntArrayRegion(outCodePoints, 0, outCodePointsLength, wordCodePoints); + return nextToken; +} + +static void latinime_BinaryDictionary_getWordProperty(JNIEnv *env, jclass clazz, jlong dict, jintArray word, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets, - jobject outShortcutProbabilities) { + jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilityInfo, + jobject outShortcutTargets, jobject outShortcutProbabilities) { Dictionary *dictionary = reinterpret_cast<Dictionary *>(dict); if (!dictionary) return; const jsize wordLength = env->GetArrayLength(word); int wordCodePoints[wordLength]; env->GetIntArrayRegion(word, 0, wordLength, wordCodePoints); - const UnigramProperty unigramProperty = dictionary->getUnigramProperty( - wordCodePoints, wordLength); - unigramProperty.outputProperties(env, outCodePoints, outFlags, outProbability, - outHistoricalInfo, outShortcutTargets, outShortcutProbabilities); + const WordProperty wordProperty = dictionary->getWordProperty(wordCodePoints, wordLength); + wordProperty.outputProperties(env, outCodePoints, outFlags, outProbabilityInfo, + outBigramTargets, outBigramProbabilityInfo, outShortcutTargets, + outShortcutProbabilities); } static jfloat latinime_BinaryDictionary_calcNormalizedScore(JNIEnv *env, jclass clazz, @@ -521,9 +541,15 @@ static const JNINativeMethod sMethods[] = { reinterpret_cast<void *>(latinime_BinaryDictionary_getBigramProbability) }, { - const_cast<char *>("getUnigramPropertyNative"), - const_cast<char *>("(J[I[I[Z[I[ILjava/util/ArrayList;Ljava/util/ArrayList;)V"), - reinterpret_cast<void *>(latinime_BinaryDictionary_getUnigramProperty) + const_cast<char *>("getWordPropertyNative"), + const_cast<char *>("(J[I[I[Z[ILjava/util/ArrayList;Ljava/util/ArrayList;" + "Ljava/util/ArrayList;Ljava/util/ArrayList;)V"), + reinterpret_cast<void *>(latinime_BinaryDictionary_getWordProperty) + }, + { + const_cast<char *>("getNextWordNative"), + const_cast<char *>("(JI[I)I"), + reinterpret_cast<void *>(latinime_BinaryDictionary_getNextWord) }, { const_cast<char *>("calcNormalizedScoreNative"), diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index e68c0a6d8..9b71eff7a 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -143,13 +143,19 @@ void Dictionary::getProperty(const char *const query, const int queryLength, cha maxResultLength); } -const UnigramProperty Dictionary::getUnigramProperty(const int *const codePoints, +const WordProperty Dictionary::getWordProperty(const int *const codePoints, const int codePointCount) { TimeKeeper::setCurrentTime(); - return mDictionaryStructureWithBufferPolicy.get()->getUnigramProperty( + return mDictionaryStructureWithBufferPolicy.get()->getWordProperty( codePoints, codePointCount); } +int Dictionary::getNextWordAndNextToken(const int token, int *const outCodePoints) { + TimeKeeper::setCurrentTime(); + return mDictionaryStructureWithBufferPolicy.get()->getNextWordAndNextToken( + token, outCodePoints); +} + void Dictionary::logDictionaryInfo(JNIEnv *const env) const { int dictionaryIdCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE]; int versionStringCodePointBuffer[HEADER_ATTRIBUTE_BUFFER_SIZE]; diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index b37b4aa18..0a413cb52 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -22,7 +22,7 @@ #include "defines.h" #include "jni.h" #include "suggest/core/dictionary/bigram_dictionary.h" -#include "suggest/core/dictionary/unigram_property.h" +#include "suggest/core/dictionary/word_property.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" #include "suggest/core/suggest_interface.h" @@ -34,7 +34,7 @@ class DictionaryStructureWithBufferPolicy; class DicTraverseSession; class ProximityInfo; class SuggestOptions; -class UnigramProperty; +class WordProperty; class Dictionary { public: @@ -94,7 +94,12 @@ class Dictionary { void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength); - const UnigramProperty getUnigramProperty(const int *const codePoints, const int codePointCount); + const WordProperty getWordProperty(const int *const codePoints, const int codePointCount); + + // Method to iterate all words in the dictionary. + // The returned token has to be used to get the next word. If token is 0, this method newly + // starts iterating the dictionary. + int getNextWordAndNextToken(const int token, int *const outCodePoints); const DictionaryStructureWithBufferPolicy *getDictionaryStructurePolicy() const { return mDictionaryStructureWithBufferPolicy.get(); diff --git a/native/jni/src/suggest/core/dictionary/unigram_property.cpp b/native/jni/src/suggest/core/dictionary/unigram_property.cpp deleted file mode 100644 index 16bbb69d8..000000000 --- a/native/jni/src/suggest/core/dictionary/unigram_property.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "suggest/core/dictionary/unigram_property.h" - -namespace latinime { - -void UnigramProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, - jbooleanArray outFlags, jintArray outProbability, jintArray outHistoricalInfo, - jobject outShortcutTargets, jobject outShortcutProbabilities) const { - env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePointCount, mCodePoints); - jboolean flags[] = {mIsNotAWord, mIsBlacklisted, mHasBigrams, mHasShortcuts}; - env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); - env->SetIntArrayRegion(outProbability, 0 /* start */, 1 /* len */, &mProbability); - int historicalInfo[] = {mTimestamp, mLevel, mCount}; - env->SetIntArrayRegion(outHistoricalInfo, 0 /* start */, NELEMS(historicalInfo), - historicalInfo); - - jclass integerClass = env->FindClass("java/lang/Integer"); - jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); - jclass arrayListClass = env->FindClass("java/util/ArrayList"); - jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); - const int shortcutTargetCount = mShortcutTargets.size(); - for (int i = 0; i < shortcutTargetCount; ++i) { - jintArray shortcutTargetCodePointArray = env->NewIntArray(mShortcutTargets[i].size()); - env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */, - mShortcutTargets[i].size(), &mShortcutTargets[i][0]); - env->CallVoidMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); - env->DeleteLocalRef(shortcutTargetCodePointArray); - jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, - mShortcutProbabilities[i]); - env->CallVoidMethod(outShortcutProbabilities, addMethodId, integerProbability); - env->DeleteLocalRef(integerProbability); - } - env->DeleteLocalRef(integerClass); - env->DeleteLocalRef(arrayListClass); -} - -} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/unigram_property.h b/native/jni/src/suggest/core/dictionary/unigram_property.h deleted file mode 100644 index c4ebb86ab..000000000 --- a/native/jni/src/suggest/core/dictionary/unigram_property.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (C) 2013 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_UNIGRAM_PROPERTY_H -#define LATINIME_UNIGRAM_PROPERTY_H - -#include <cstring> -#include <vector> - -#include "defines.h" -#include "jni.h" - -namespace latinime { - -// This class is used for returning information belonging to a unigram to java side. -class UnigramProperty { - public: - // Invalid unigram. - UnigramProperty() - : mCodePoints(), mCodePointCount(0), mIsNotAWord(false), mIsBlacklisted(false), - mHasBigrams(false), mHasShortcuts(false), mProbability(NOT_A_PROBABILITY), - mTimestamp(0), mLevel(0), mCount(0), mShortcutTargets(), mShortcutProbabilities() {} - - UnigramProperty(const UnigramProperty &unigramProperty) - : mCodePoints(), mCodePointCount(unigramProperty.mCodePointCount), - mIsNotAWord(unigramProperty.mIsNotAWord), - mIsBlacklisted(unigramProperty.mIsBlacklisted), - mHasBigrams(unigramProperty.mHasBigrams), - mHasShortcuts(unigramProperty.mHasShortcuts), - mProbability(unigramProperty.mProbability), - mTimestamp(unigramProperty.mTimestamp), mLevel(unigramProperty.mLevel), - mCount(unigramProperty.mCount), mShortcutTargets(unigramProperty.mShortcutTargets), - mShortcutProbabilities(unigramProperty.mShortcutProbabilities) { - memcpy(mCodePoints, unigramProperty.mCodePoints, sizeof(mCodePoints)); - } - - UnigramProperty(const int *const codePoints, const int codePointCount, - const bool isNotAWord, const bool isBlacklisted, const bool hasBigrams, - const bool hasShortcuts, const int probability, const int timestamp, - const int level, const int count, - const std::vector<std::vector<int> > *const shortcutTargets, - const std::vector<int> *const shortcutProbabilities) - : mCodePoints(), mCodePointCount(codePointCount), - mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), mHasBigrams(hasBigrams), - mHasShortcuts(hasShortcuts), mProbability(probability), mTimestamp(timestamp), - mLevel(level), mCount(count), mShortcutTargets(*shortcutTargets), - mShortcutProbabilities(*shortcutProbabilities) { - memcpy(mCodePoints, codePoints, sizeof(mCodePoints)); - } - - void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, - jintArray outProbability, jintArray outHistoricalInfo, jobject outShortcutTargets, - jobject outShortcutProbabilities) const; - - private: - DISALLOW_ASSIGNMENT_OPERATOR(UnigramProperty); - - int mCodePoints[MAX_WORD_LENGTH]; - int mCodePointCount; - bool mIsNotAWord; - bool mIsBlacklisted; - bool mHasBigrams; - bool mHasShortcuts; - int mProbability; - // Historical information - int mTimestamp; - int mLevel; - int mCount; - // Shortcut - std::vector<std::vector<int> > mShortcutTargets; - std::vector<int> mShortcutProbabilities; -}; -} // namespace latinime -#endif // LATINIME_UNIGRAM_PROPERTY_H diff --git a/native/jni/src/suggest/core/dictionary/word_property.cpp b/native/jni/src/suggest/core/dictionary/word_property.cpp new file mode 100644 index 000000000..288e6b05e --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/word_property.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "suggest/core/dictionary/word_property.h" + +namespace latinime { + +void WordProperty::outputProperties(JNIEnv *const env, jintArray outCodePoints, + jbooleanArray outFlags, jintArray outProbabilityInfo, jobject outBigramTargets, + jobject outBigramProbabilities, jobject outShortcutTargets, + jobject outShortcutProbabilities) const { + env->SetIntArrayRegion(outCodePoints, 0 /* start */, mCodePoints.size(), &mCodePoints[0]); + jboolean flags[] = {mIsNotAWord, mIsBlacklisted, mHasBigrams, mHasShortcuts}; + env->SetBooleanArrayRegion(outFlags, 0 /* start */, NELEMS(flags), flags); + int probabilityInfo[] = {mProbability, mTimestamp, mLevel, mCount}; + env->SetIntArrayRegion(outProbabilityInfo, 0 /* start */, NELEMS(probabilityInfo), + probabilityInfo); + + jclass integerClass = env->FindClass("java/lang/Integer"); + jmethodID intToIntegerConstructorId = env->GetMethodID(integerClass, "<init>", "(I)V"); + jclass arrayListClass = env->FindClass("java/util/ArrayList"); + jmethodID addMethodId = env->GetMethodID(arrayListClass, "add", "(Ljava/lang/Object;)Z"); + + // Output bigrams. + const int bigramCount = mBigrams.size(); + for (int i = 0; i < bigramCount; ++i) { + const BigramProperty *const bigramProperty = &mBigrams[i]; + const std::vector<int> *const word1CodePoints = bigramProperty->getTargetCodePoints(); + jintArray bigramWord1CodePointArray = env->NewIntArray(word1CodePoints->size()); + env->SetIntArrayRegion(bigramWord1CodePointArray, 0 /* start */, + word1CodePoints->size(), &word1CodePoints->at(0)); + env->CallVoidMethod(outBigramTargets, addMethodId, bigramWord1CodePointArray); + env->DeleteLocalRef(bigramWord1CodePointArray); + + int bigramProbabilityInfo[] = {bigramProperty->getProbability(), + bigramProperty->getTimestamp(), bigramProperty->getLevel(), + bigramProperty->getCount()}; + jintArray bigramProbabilityInfoArray = env->NewIntArray(NELEMS(bigramProbabilityInfo)); + env->SetIntArrayRegion(bigramProbabilityInfoArray, 0 /* start */, + NELEMS(bigramProbabilityInfo), bigramProbabilityInfo); + env->CallVoidMethod(outBigramProbabilities, addMethodId, bigramProbabilityInfoArray); + env->DeleteLocalRef(bigramProbabilityInfoArray); + } + + // Output shortcuts. + const int shortcutTargetCount = mShortcuts.size(); + for (int i = 0; i < shortcutTargetCount; ++i) { + const std::vector<int> *const targetCodePoints = mShortcuts[i].getTargetCodePoints(); + jintArray shortcutTargetCodePointArray = env->NewIntArray(targetCodePoints->size()); + env->SetIntArrayRegion(shortcutTargetCodePointArray, 0 /* start */, + targetCodePoints->size(), &targetCodePoints->at(0)); + env->CallVoidMethod(outShortcutTargets, addMethodId, shortcutTargetCodePointArray); + env->DeleteLocalRef(shortcutTargetCodePointArray); + jobject integerProbability = env->NewObject(integerClass, intToIntegerConstructorId, + mShortcuts[i].getProbability()); + env->CallVoidMethod(outShortcutProbabilities, addMethodId, integerProbability); + env->DeleteLocalRef(integerProbability); + } + env->DeleteLocalRef(integerClass); + env->DeleteLocalRef(arrayListClass); +} + +} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/word_property.h b/native/jni/src/suggest/core/dictionary/word_property.h new file mode 100644 index 000000000..40b1a91a4 --- /dev/null +++ b/native/jni/src/suggest/core/dictionary/word_property.h @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2013 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_WORD_PROPERTY_H +#define LATINIME_WORD_PROPERTY_H + +#include <cstring> +#include <vector> + +#include "defines.h" +#include "jni.h" + +namespace latinime { + +// This class is used for returning information belonging to a word to java side. +class WordProperty { + public: + class BigramProperty { + public: + BigramProperty(const std::vector<int> *const targetCodePoints, + const int probability, const int timestamp, const int level, const int count) + : mTargetCodePoints(*targetCodePoints), mProbability(probability), + mTimestamp(timestamp), mLevel(level), mCount(count) {} + + const std::vector<int> *getTargetCodePoints() const { + return &mTargetCodePoints; + } + + int getProbability() const { + return mProbability; + } + + int getTimestamp() const { + return mTimestamp; + } + + int getLevel() const { + return mLevel; + } + + int getCount() const { + return mCount; + } + + private: + std::vector<int> mTargetCodePoints; + int mProbability; + int mTimestamp; + int mLevel; + int mCount; + }; + + class ShortcutProperty { + public: + ShortcutProperty(const std::vector<int> *const targetCodePoints, const int probability) + : mTargetCodePoints(*targetCodePoints), mProbability(probability) {} + + const std::vector<int> *getTargetCodePoints() const { + return &mTargetCodePoints; + } + + int getProbability() const { + return mProbability; + } + + private: + std::vector<int> mTargetCodePoints; + int mProbability; + }; + + // Invalid word. + WordProperty() + : mCodePoints(), mIsNotAWord(false), mIsBlacklisted(false), + mHasBigrams(false), mHasShortcuts(false), mProbability(NOT_A_PROBABILITY), + mTimestamp(0), mLevel(0), mCount(0), mBigrams(), mShortcuts() {} + + WordProperty(const std::vector<int> *const codePoints, + const bool isNotAWord, const bool isBlacklisted, const bool hasBigrams, + const bool hasShortcuts, const int probability, const int timestamp, + const int level, const int count, const std::vector<BigramProperty> *const bigrams, + const std::vector<ShortcutProperty> *const shortcuts) + : mCodePoints(*codePoints), mIsNotAWord(isNotAWord), mIsBlacklisted(isBlacklisted), + mHasBigrams(hasBigrams), mHasShortcuts(hasShortcuts), mProbability(probability), + mTimestamp(timestamp), mLevel(level), mCount(count), mBigrams(*bigrams), + mShortcuts(*shortcuts) {} + + void outputProperties(JNIEnv *const env, jintArray outCodePoints, jbooleanArray outFlags, + jintArray outProbabilityInfo, jobject outBigramTargets, jobject outBigramProbabilities, + jobject outShortcutTargets, jobject outShortcutProbabilities) const; + + private: + DISALLOW_ASSIGNMENT_OPERATOR(WordProperty); + + std::vector<int> mCodePoints; + bool mIsNotAWord; + bool mIsBlacklisted; + bool mHasBigrams; + bool mHasShortcuts; + int mProbability; + // Historical information + int mTimestamp; + int mLevel; + int mCount; + std::vector<BigramProperty> mBigrams; + std::vector<ShortcutProperty> mShortcuts; +}; +} // namespace latinime +#endif // LATINIME_WORD_PROPERTY_H diff --git a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h index c74a4ebbe..784419586 100644 --- a/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h +++ b/native/jni/src/suggest/core/policy/dictionary_structure_with_buffer_policy.h @@ -18,7 +18,7 @@ #define LATINIME_DICTIONARY_STRUCTURE_POLICY_H #include "defines.h" -#include "suggest/core/dictionary/unigram_property.h" +#include "suggest/core/dictionary/word_property.h" #include "utils/exclusive_ownership_pointer.h" namespace latinime { @@ -92,9 +92,14 @@ class DictionaryStructureWithBufferPolicy { const int maxResultLength) = 0; // Used for testing. - virtual const UnigramProperty getUnigramProperty(const int *const codePonts, + virtual const WordProperty getWordProperty(const int *const codePonts, const int codePointCount) const = 0; + // Method to iterate all words in the dictionary. + // The returned token has to be used to get the next word. If token is 0, this method newly + // starts iterating the dictionary. + virtual int getNextWordAndNextToken(const int token, int *const outCodePoints) = 0; + protected: DictionaryStructureWithBufferPolicy() {} diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp index 37a5b3fe4..7504524f0 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.cpp @@ -24,7 +24,7 @@ const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY = "REQUIRES_GERMAN_UMLAUT_PROCESSING"; // TODO: Change attribute string to "IS_DECAYING_DICT". const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE"; -const char *const HeaderPolicy::LAST_UPDATED_TIME_KEY = "date"; +const char *const HeaderPolicy::DATE_KEY = "date"; const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME"; const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT"; const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT"; @@ -73,13 +73,13 @@ bool HeaderPolicy::readRequiresGermanUmlautProcessing() const { REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false); } -bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastUpdatedTime, - const bool updatesLastDecayedTime, const int unigramCount, const int bigramCount, +bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const { int writingPos = 0; HeaderReadWriteUtils::AttributeMap attributeMapToWrite(mAttributeMap); - fillInHeader(updatesLastDecayedTime, updatesLastDecayedTime, - unigramCount, bigramCount, extendedRegionSize, &attributeMapToWrite); + fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount, + extendedRegionSize, &attributeMapToWrite); if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion, &writingPos)) { return false; @@ -106,18 +106,16 @@ bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastUpdatedTim return true; } -void HeaderPolicy::fillInHeader(const bool updatesLastUpdatedTime, - const bool updatesLastDecayedTime, const int unigramCount, const int bigramCount, - const int extendedRegionSize, HeaderReadWriteUtils::AttributeMap *outAttributeMap) const { +void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount, + const int bigramCount, const int extendedRegionSize, + HeaderReadWriteUtils::AttributeMap *outAttributeMap) const { HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount); HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY, extendedRegionSize); - if (updatesLastUpdatedTime) { - // Set current time as the last updated time. - HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_UPDATED_TIME_KEY, - TimeKeeper::peekCurrentTime()); - } + // Set the current time as the generation time. + HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY, + TimeKeeper::peekCurrentTime()); if (updatesLastDecayedTime) { // Set current time as the last updated time. HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY, diff --git a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h index d65315212..a44f9f0fc 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/header/header_policy.h @@ -39,8 +39,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, IS_DECAYING_DICT_KEY, false /* defaultValue */)), - mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_UPDATED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, @@ -62,10 +62,10 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, IS_DECAYING_DICT_KEY, false /* defaultValue */)), - mLastUpdatedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_UPDATED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, - LAST_UPDATED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), + DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)) {} @@ -75,7 +75,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), mAttributeMap(), mMultiWordCostMultiplier(0.0f), mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), - mLastUpdatedTime(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), + mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false) {} ~HeaderPolicy() {} @@ -122,8 +122,8 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { return mRequiresGermanUmlautProcessing; } - AK_FORCE_INLINE int getLastUpdatedTime() const { - return mLastUpdatedTime; + AK_FORCE_INLINE int getDate() const { + return mDate; } AK_FORCE_INLINE int getLastDecayedTime() const { @@ -149,11 +149,11 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { void readHeaderValueOrQuestionMark(const char *const key, int *outValue, int outValueSize) const; - bool fillInAndWriteHeaderToBuffer(const bool updatesLastUpdatedTime, - const bool updatesLastDecayedTime, const int unigramCount, const int bigramCount, + bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, + const int unigramCount, const int bigramCount, const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const; - void fillInHeader(const bool updatesLastUpdatedTime, const bool updatesLastDecayedTime, + void fillInHeader(const bool updatesLastDecayedTime, const int unigramCount, const int bigramCount, const int extendedRegionSize, HeaderReadWriteUtils::AttributeMap *outAttributeMap) const; @@ -163,7 +163,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; static const char *const IS_DECAYING_DICT_KEY; - static const char *const LAST_UPDATED_TIME_KEY; + static const char *const DATE_KEY; static const char *const LAST_DECAYED_TIME_KEY; static const char *const UNIGRAM_COUNT_KEY; static const char *const BIGRAM_COUNT_KEY; @@ -179,7 +179,7 @@ class HeaderPolicy : public DictionaryHeaderStructurePolicy { const float mMultiWordCostMultiplier; const bool mRequiresGermanUmlautProcessing; const bool mIsDecayingDict; - const int mLastUpdatedTime; + const int mDate; const int mLastDecayedTime; const int mUnigramCount; const int mBigramCount; diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h index 2adafd22b..319c81569 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.h @@ -123,10 +123,15 @@ class PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { } } - const UnigramProperty getUnigramProperty(const int *const codePoints, + const WordProperty getWordProperty(const int *const codePoints, const int codePointCount) const { - // getUnigramProperty is not supported. - return UnigramProperty(); + // getWordProperty is not supported. + return WordProperty(); + } + + int getNextWordAndNextToken(const int token, int *const outCodePoints) { + // getNextWordAndNextToken is not supported. + return 0; } private: diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp index b4730fe68..1c420e070 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.cpp @@ -20,7 +20,7 @@ #include "suggest/core/dicnode/dic_node.h" #include "suggest/core/dicnode/dic_node_vector.h" -#include "suggest/core/dictionary/unigram_property.h" +#include "suggest/core/dictionary/word_property.h" #include "suggest/policyimpl/dictionary/structure/pt_common/dynamic_pt_reading_helper.h" #include "suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_node_reader.h" #include "suggest/policyimpl/dictionary/utils/forgetting_curve_utils.h" @@ -317,22 +317,59 @@ void Ver4PatriciaTriePolicy::getProperty(const char *const query, const int quer } } -const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *const codePoints, +const WordProperty Ver4PatriciaTriePolicy::getWordProperty(const int *const codePoints, const int codePointCount) const { const int ptNodePos = getTerminalPtNodePositionOfWord(codePoints, codePointCount, false /* forceLowerCaseSearch */); if (ptNodePos == NOT_A_DICT_POS) { - AKLOGE("fetchUnigramProperty is called for invalid word."); - return UnigramProperty(); + AKLOGE("getWordProperty is called for invalid word."); + return WordProperty(); } const PtNodeParams ptNodeParams = mNodeReader.fetchNodeInfoInBufferFromPtNodePos(ptNodePos); + std::vector<int> codePointVector(ptNodeParams.getCodePoints(), + ptNodeParams.getCodePoints() + ptNodeParams.getCodePointCount()); const ProbabilityEntry probabilityEntry = mBuffers.get()->getProbabilityDictContent()->getProbabilityEntry( ptNodeParams.getTerminalId()); const HistoricalInfo *const historicalInfo = probabilityEntry.getHistoricalInfo(); + // Fetch bigram information. + std::vector<WordProperty::BigramProperty> bigrams; + const int bigramListPos = getBigramsPositionOfPtNode(ptNodePos); + if (bigramListPos != NOT_A_DICT_POS) { + int bigramWord1CodePoints[MAX_WORD_LENGTH]; + const BigramDictContent *const bigramDictContent = mBuffers.get()->getBigramDictContent(); + const TerminalPositionLookupTable *const terminalPositionLookupTable = + mBuffers.get()->getTerminalPositionLookupTable(); + bool hasNext = true; + int readingPos = bigramListPos; + while (hasNext) { + const BigramEntry bigramEntry = + bigramDictContent->getBigramEntryAndAdvancePosition(&readingPos); + hasNext = bigramEntry.hasNext(); + const int word1TerminalId = bigramEntry.getTargetTerminalId(); + const int word1TerminalPtNodePos = + terminalPositionLookupTable->getTerminalPtNodePosition(word1TerminalId); + if (word1TerminalPtNodePos == NOT_A_DICT_POS) { + continue; + } + // Word (unigram) probability + int word1Probability = NOT_A_PROBABILITY; + const int codePointCount = getCodePointsAndProbabilityAndReturnCodePointCount( + word1TerminalPtNodePos, MAX_WORD_LENGTH, bigramWord1CodePoints, + &word1Probability); + std::vector<int> word1(bigramWord1CodePoints, + bigramWord1CodePoints + codePointCount); + const HistoricalInfo *const historicalInfo = bigramEntry.getHistoricalInfo(); + const int probability = bigramEntry.hasHistoricalInfo() ? + ForgettingCurveUtils::decodeProbability(bigramEntry.getHistoricalInfo()) : + bigramEntry.getProbability(); + bigrams.push_back(WordProperty::BigramProperty(&word1, probability, + historicalInfo->getTimeStamp(), historicalInfo->getLevel(), + historicalInfo->getCount())); + } + } // Fetch shortcut information. - std::vector<std::vector<int> > shortcutTargets; - std::vector<int> shortcutProbabilities; + std::vector<WordProperty::ShortcutProperty> shortcuts; int shortcutPos = getShortcutPositionOfPtNode(ptNodePos); if (shortcutPos != NOT_A_DICT_POS) { int shortcutTarget[MAX_WORD_LENGTH]; @@ -345,15 +382,20 @@ const UnigramProperty Ver4PatriciaTriePolicy::getUnigramProperty(const int *cons shortcutDictContent->getShortcutEntryAndAdvancePosition(MAX_WORD_LENGTH, shortcutTarget, &shortcutTargetLength, &shortcutProbability, &hasNext, &shortcutPos); std::vector<int> target(shortcutTarget, shortcutTarget + shortcutTargetLength); - shortcutTargets.push_back(target); - shortcutProbabilities.push_back(shortcutProbability); + shortcuts.push_back(WordProperty::ShortcutProperty(&target, shortcutProbability)); } } - return UnigramProperty(ptNodeParams.getCodePoints(), ptNodeParams.getCodePointCount(), - ptNodeParams.isNotAWord(), ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(), + return WordProperty(&codePointVector, ptNodeParams.isNotAWord(), + ptNodeParams.isBlacklisted(), ptNodeParams.hasBigrams(), ptNodeParams.hasShortcutTargets(), ptNodeParams.getProbability(), historicalInfo->getTimeStamp(), historicalInfo->getLevel(), - historicalInfo->getCount(), &shortcutTargets, &shortcutProbabilities); + historicalInfo->getCount(), &bigrams, &shortcuts); +} + +int Ver4PatriciaTriePolicy::getNextWordAndNextToken(const int token, + int *const outCodePoints) { + // TODO: Implement. + return 0; } } // namespace latinime diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h index 81aed20a3..1bcd4ceea 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_policy.h @@ -106,9 +106,11 @@ class Ver4PatriciaTriePolicy : public DictionaryStructureWithBufferPolicy { void getProperty(const char *const query, const int queryLength, char *const outResult, const int maxResultLength); - const UnigramProperty getUnigramProperty(const int *const codePoints, + const WordProperty getWordProperty(const int *const codePoints, const int codePointCount) const; + int getNextWordAndNextToken(const int token, int *const outCodePoints); + private: DISALLOW_IMPLICIT_CONSTRUCTORS(Ver4PatriciaTriePolicy); diff --git a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp index b6f813c75..672097455 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v4/ver4_patricia_trie_writing_helper.cpp @@ -39,12 +39,11 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFile(const char *const dictDirPat BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); const int extendedRegionSize = headerPolicy->getExtendedRegionSize() + mBuffers->getTrieBuffer()->getUsedAdditionalBufferSize(); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastUpdatedTime */, - false /* updatesLastDecayedTime */, unigramCount, bigramCount, extendedRegionSize, - &headerBuffer)) { - AKLOGE("Cannot write header structure to buffer. updatesLastUpdatedTime: %d, " + if (!headerPolicy->fillInAndWriteHeaderToBuffer(false /* updatesLastDecayedTime */, + unigramCount, bigramCount, extendedRegionSize, &headerBuffer)) { + AKLOGE("Cannot write header structure to buffer. " "updatesLastDecayedTime: %d, unigramCount: %d, bigramCount: %d, " - "extendedRegionSize: %d", false, false, unigramCount, bigramCount, + "extendedRegionSize: %d", false, unigramCount, bigramCount, extendedRegionSize); return; } @@ -63,9 +62,8 @@ void Ver4PatriciaTrieWritingHelper::writeToDictFileWithGC(const int rootPtNodeAr } BufferWithExtendableBuffer headerBuffer( BufferWithExtendableBuffer::DEFAULT_MAX_ADDITIONAL_BUFFER_SIZE); - if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastUpdatedTime */, - true /* updatesLastDecayedTime */, unigramCount, bigramCount, - 0 /* extendedRegionSize */, &headerBuffer)) { + if (!headerPolicy->fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + unigramCount, bigramCount, 0 /* extendedRegionSize */, &headerBuffer)) { return; } dictBuffers.get()->flushHeaderAndDictBuffers(dictDirPath, &headerBuffer); diff --git a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp index 1e57a0b5d..84403c807 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/utils/dict_file_writing_utils.cpp @@ -48,8 +48,8 @@ const char *const DictFileWritingUtils::TEMP_FILE_SUFFIX_FOR_WRITING_DICT_FILE = HeaderPolicy headerPolicy(FormatUtils::VERSION_4, attributeMap); Ver4DictBuffers::Ver4DictBuffersPtr dictBuffers = Ver4DictBuffers::createVer4DictBuffers(&headerPolicy); - headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastUpdatedTime */, - true /* updatesLastDecayedTime */, 0 /* unigramCount */, 0 /* bigramCount */, + headerPolicy.fillInAndWriteHeaderToBuffer(true /* updatesLastDecayedTime */, + 0 /* unigramCount */, 0 /* bigramCount */, 0 /* extendedRegionSize */, dictBuffers.get()->getWritableHeaderBuffer()); if (!DynamicPtWritingUtils::writeEmptyDictionary( dictBuffers.get()->getWritableTrieBuffer(), 0 /* rootPos */)) { |