diff options
Diffstat (limited to 'native/jni/src')
7 files changed, 115 insertions, 171 deletions
diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp b/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp deleted file mode 100644 index 295e760d6..000000000 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (C) 2010, The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define LOG_TAG "LatinIME: bigram_dictionary.cpp" - -#include "bigram_dictionary.h" - -#include <algorithm> -#include <cstring> - -#include "defines.h" -#include "suggest/core/dictionary/binary_dictionary_bigrams_iterator.h" -#include "suggest/core/dictionary/dictionary.h" -#include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" -#include "suggest/core/result/suggestion_results.h" -#include "suggest/core/session/prev_words_info.h" -#include "utils/char_utils.h" - -namespace latinime { - -BigramDictionary::BigramDictionary( - const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy) - : mDictionaryStructurePolicy(dictionaryStructurePolicy) { - if (DEBUG_DICT) { - AKLOGI("BigramDictionary - constructor"); - } -} - -BigramDictionary::~BigramDictionary() { -} - -/* Parameters : - * prevWordsInfo: Information of previous words to get the predictions. - * outSuggestionResults: SuggestionResults to put the predictions. 
- */ -void BigramDictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, - SuggestionResults *const outSuggestionResults) const { - int unigramProbability = 0; - int bigramCodePoints[MAX_WORD_LENGTH]; - BinaryDictionaryBigramsIterator bigramsIt = - prevWordsInfo->getBigramsIteratorForPrediction(mDictionaryStructurePolicy); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { - continue; - } - const int codePointCount = mDictionaryStructurePolicy-> - getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(), - MAX_WORD_LENGTH, bigramCodePoints, &unigramProbability); - if (codePointCount <= 0) { - continue; - } - // Due to space constraints, the probability for bigrams is approximate - the lower the - // unigram probability, the worse the precision. The theoritical maximum error in - // resulting probability is 8 - although in the practice it's never bigger than 3 or 4 - // in very bad cases. This means that sometimes, we'll see some bigrams interverted - // here, but it can't get too bad. - const int probability = mDictionaryStructurePolicy->getProbability( - unigramProbability, bigramsIt.getProbability()); - outSuggestionResults->addPrediction(bigramCodePoints, codePointCount, probability); - } -} - -// Returns a pointer to the start of the bigram list. -// If the word is not found or has no bigrams, this function returns NOT_A_DICT_POS. 
-int BigramDictionary::getBigramListPositionForWord(const int *prevWord, const int prevWordLength, - const bool forceLowerCaseSearch) const { - if (0 >= prevWordLength) return NOT_A_DICT_POS; - int pos = mDictionaryStructurePolicy->getTerminalPtNodePositionOfWord(prevWord, prevWordLength, - forceLowerCaseSearch); - if (NOT_A_DICT_POS == pos) return NOT_A_DICT_POS; - return mDictionaryStructurePolicy->getBigramsPositionOfPtNode(pos); -} - -int BigramDictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, - const int *word1, int length1) const { - int nextWordPos = mDictionaryStructurePolicy->getTerminalPtNodePositionOfWord(word1, length1, - false /* forceLowerCaseSearch */); - if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; - BinaryDictionaryBigramsIterator bigramsIt = - prevWordsInfo->getBigramsIteratorForPrediction(mDictionaryStructurePolicy); - while (bigramsIt.hasNext()) { - bigramsIt.next(); - if (bigramsIt.getBigramPos() == nextWordPos - && bigramsIt.getProbability() != NOT_A_PROBABILITY) { - return mDictionaryStructurePolicy->getProbability( - mDictionaryStructurePolicy->getUnigramProbabilityOfPtNode(nextWordPos), - bigramsIt.getProbability()); - } - } - return NOT_A_PROBABILITY; -} - -// TODO: Move functions related to bigram to here -} // namespace latinime diff --git a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h b/native/jni/src/suggest/core/dictionary/bigram_dictionary.h deleted file mode 100644 index bd3aed1bd..000000000 --- a/native/jni/src/suggest/core/dictionary/bigram_dictionary.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (C) 2010 The Android Open Source Project - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef LATINIME_BIGRAM_DICTIONARY_H -#define LATINIME_BIGRAM_DICTIONARY_H - -#include "defines.h" - -namespace latinime { - -class DictionaryStructureWithBufferPolicy; -class PrevWordsInfo; -class SuggestionResults; - -class BigramDictionary { - public: - BigramDictionary(const DictionaryStructureWithBufferPolicy *const dictionaryStructurePolicy); - - void getPredictions(const PrevWordsInfo *const prevWordsInfo, - SuggestionResults *const outSuggestionResults) const; - int getBigramProbability(const PrevWordsInfo *const prevWordsInfo, - const int *word1, int length1) const; - ~BigramDictionary(); - - private: - DISALLOW_IMPLICIT_CONSTRUCTORS(BigramDictionary); - - int getBigramListPositionForWord(const int *prevWord, const int prevWordLength, - const bool forceLowerCaseSearch) const; - - const DictionaryStructureWithBufferPolicy *const mDictionaryStructurePolicy; -}; -} // namespace latinime -#endif // LATINIME_BIGRAM_DICTIONARY_H diff --git a/native/jni/src/suggest/core/dictionary/dictionary.cpp b/native/jni/src/suggest/core/dictionary/dictionary.cpp index 228260216..fb25f757c 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.cpp +++ b/native/jni/src/suggest/core/dictionary/dictionary.cpp @@ -23,6 +23,7 @@ #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/result/suggestion_results.h" #include "suggest/core/session/dic_traverse_session.h" +#include "suggest/core/session/prev_words_info.h" #include "suggest/core/suggest.h" #include "suggest/core/suggest_options.h" #include 
"suggest/policyimpl/gesture/gesture_suggest_policy_factory.h" @@ -37,7 +38,6 @@ const int Dictionary::HEADER_ATTRIBUTE_BUFFER_SIZE = 32; Dictionary::Dictionary(JNIEnv *env, DictionaryStructureWithBufferPolicy::StructurePolicyPtr dictionaryStructureWithBufferPolicy) : mDictionaryStructureWithBufferPolicy(std::move(dictionaryStructureWithBufferPolicy)), - mBigramDictionary(mDictionaryStructureWithBufferPolicy.get()), mGestureSuggest(new Suggest(GestureSuggestPolicyFactory::getGestureSuggestPolicy())), mTypingSuggest(new Suggest(TypingSuggestPolicyFactory::getTypingSuggestPolicy())) { logDictionaryInfo(env); @@ -62,7 +62,29 @@ void Dictionary::getSuggestions(ProximityInfo *proximityInfo, DicTraverseSession void Dictionary::getPredictions(const PrevWordsInfo *const prevWordsInfo, SuggestionResults *const outSuggestionResults) const { TimeKeeper::setCurrentTime(); - mBigramDictionary.getPredictions(prevWordsInfo, outSuggestionResults); + int unigramProbability = 0; + int bigramCodePoints[MAX_WORD_LENGTH]; + BinaryDictionaryBigramsIterator bigramsIt = prevWordsInfo->getBigramsIteratorForPrediction( + mDictionaryStructureWithBufferPolicy.get()); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == NOT_A_DICT_POS) { + continue; + } + if (prevWordsInfo->isNthPrevWordBeginningOfSentence(1 /* n */) + && bigramsIt.getProbability() == NOT_A_PROBABILITY) { + continue; + } + const int codePointCount = mDictionaryStructureWithBufferPolicy-> + getCodePointsAndProbabilityAndReturnCodePointCount(bigramsIt.getBigramPos(), + MAX_WORD_LENGTH, bigramCodePoints, &unigramProbability); + if (codePointCount <= 0) { + continue; + } + const int probability = mDictionaryStructureWithBufferPolicy->getProbability( + unigramProbability, bigramsIt.getProbability()); + outSuggestionResults->addPrediction(bigramCodePoints, codePointCount, probability); + } } int Dictionary::getProbability(const int *word, int length) const { @@ -81,10 +103,24 @@ int 
Dictionary::getMaxProbabilityOfExactMatches(const int *word, int length) con mDictionaryStructureWithBufferPolicy.get(), word, length); } -int Dictionary::getBigramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, +int Dictionary::getNgramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, int length) const { TimeKeeper::setCurrentTime(); - return mBigramDictionary.getBigramProbability(prevWordsInfo, word, length); + int nextWordPos = mDictionaryStructureWithBufferPolicy->getTerminalPtNodePositionOfWord(word, + length, false /* forceLowerCaseSearch */); + if (NOT_A_DICT_POS == nextWordPos) return NOT_A_PROBABILITY; + BinaryDictionaryBigramsIterator bigramsIt = prevWordsInfo->getBigramsIteratorForPrediction( + mDictionaryStructureWithBufferPolicy.get()); + while (bigramsIt.hasNext()) { + bigramsIt.next(); + if (bigramsIt.getBigramPos() == nextWordPos + && bigramsIt.getProbability() != NOT_A_PROBABILITY) { + return mDictionaryStructureWithBufferPolicy->getProbability( + mDictionaryStructureWithBufferPolicy->getUnigramProbabilityOfPtNode( + nextWordPos), bigramsIt.getProbability()); + } + } + return NOT_A_PROBABILITY; } bool Dictionary::addUnigramEntry(const int *const word, const int length, diff --git a/native/jni/src/suggest/core/dictionary/dictionary.h b/native/jni/src/suggest/core/dictionary/dictionary.h index 247ee2421..3b41088fe 100644 --- a/native/jni/src/suggest/core/dictionary/dictionary.h +++ b/native/jni/src/suggest/core/dictionary/dictionary.h @@ -21,7 +21,6 @@ #include "defines.h" #include "jni.h" -#include "suggest/core/dictionary/bigram_dictionary.h" #include "suggest/core/dictionary/property/word_property.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/core/policy/dictionary_structure_with_buffer_policy.h" @@ -75,7 +74,7 @@ class Dictionary { int getMaxProbabilityOfExactMatches(const int *word, int length) const; - int getBigramProbability(const PrevWordsInfo *const 
prevWordsInfo, + int getNgramProbability(const PrevWordsInfo *const prevWordsInfo, const int *word, int length) const; bool addUnigramEntry(const int *const codePoints, const int codePointCount, @@ -119,7 +118,6 @@ class Dictionary { const DictionaryStructureWithBufferPolicy::StructurePolicyPtr mDictionaryStructureWithBufferPolicy; - const BigramDictionary mBigramDictionary; const SuggestInterfacePtr mGestureSuggest; const SuggestInterfacePtr mTypingSuggest; diff --git a/native/jni/src/suggest/core/session/prev_words_info.h b/native/jni/src/suggest/core/session/prev_words_info.h index 640f6a2fc..e350c6996 100644 --- a/native/jni/src/suggest/core/session/prev_words_info.h +++ b/native/jni/src/suggest/core/session/prev_words_info.h @@ -25,7 +25,6 @@ namespace latinime { // TODO: Support n-gram. -// This class does not take ownership of any code point buffers. class PrevWordsInfo { public: // No prev word information. @@ -33,21 +32,52 @@ class PrevWordsInfo { clear(); } + PrevWordsInfo(PrevWordsInfo &&prevWordsInfo) { + for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { + mPrevWordCodePointCount[i] = prevWordsInfo.mPrevWordCodePointCount[i]; + memmove(mPrevWordCodePoints[i], prevWordsInfo.mPrevWordCodePoints[i], + sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]); + mIsBeginningOfSentence[i] = prevWordsInfo.mIsBeginningOfSentence[i]; + } + } + + // Construct from previous words. 
+ PrevWordsInfo(const int prevWordCodePoints[][MAX_WORD_LENGTH], + const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence, + const size_t prevWordCount) { + clear(); + for (size_t i = 0; i < std::min(NELEMS(mPrevWordCodePoints), prevWordCount); ++i) { + if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) { + continue; + } + memmove(mPrevWordCodePoints[i], prevWordCodePoints[i], + sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]); + mPrevWordCodePointCount[i] = prevWordCodePointCount[i]; + mIsBeginningOfSentence[i] = isBeginningOfSentence[i]; + } + } + + // Construct from a previous word. PrevWordsInfo(const int *const prevWordCodePoints, const int prevWordCodePointCount, const bool isBeginningOfSentence) { clear(); - mPrevWordCodePoints[0] = prevWordCodePoints; + if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) { + return; + } + memmove(mPrevWordCodePoints[0], prevWordCodePoints, + sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount); mPrevWordCodePointCount[0] = prevWordCodePointCount; mIsBeginningOfSentence[0] = isBeginningOfSentence; } bool isValid() const { - for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { - if (mPrevWordCodePointCount[i] > MAX_WORD_LENGTH) { - return false; - } + if (mPrevWordCodePointCount[0] > 0) { + return true; + } + if (mIsBeginningOfSentence[0]) { + return true; } - return true; + return false; } void getPrevWordsTerminalPtNodePos( @@ -168,13 +198,12 @@ class PrevWordsInfo { void clear() { for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) { - mPrevWordCodePoints[i] = nullptr; mPrevWordCodePointCount[i] = 0; mIsBeginningOfSentence[i] = false; } } - const int *mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + int mPrevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; int mPrevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; bool mIsBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; }; diff --git 
a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp index 5c62b9caf..002593c49 100644 --- a/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp +++ b/native/jni/src/suggest/policyimpl/dictionary/structure/v2/patricia_trie_policy.cpp @@ -268,6 +268,10 @@ int PatriciaTriePolicy::getTerminalPtNodePositionOfWord(const int *const inWord, int PatriciaTriePolicy::getProbability(const int unigramProbability, const int bigramProbability) const { + // Due to space constraints, the probability for bigrams is approximate - the lower the unigram + // probability, the worse the precision. The theoretical maximum error in resulting probability + // is 8 - although in practice it's never bigger than 3 or 4 in very bad cases. This means + // that sometimes, we'll see some bigrams swapped here, but it can't get too bad. if (unigramProbability == NOT_A_PROBABILITY) { return NOT_A_PROBABILITY; } else if (bigramProbability == NOT_A_PROBABILITY) { diff --git a/native/jni/src/utils/jni_data_utils.h b/native/jni/src/utils/jni_data_utils.h index 3514aeeb0..cb82d3c3b 100644 --- a/native/jni/src/utils/jni_data_utils.h +++ b/native/jni/src/utils/jni_data_utils.h @@ -21,6 +21,7 @@ #include "defines.h" #include "jni.h" +#include "suggest/core/session/prev_words_info.h" #include "suggest/core/policy/dictionary_header_structure_policy.h" #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" #include "utils/char_utils.h" @@ -95,6 +96,37 @@ class JniDataUtils { } } + static PrevWordsInfo constructPrevWordsInfo(JNIEnv *env, jobjectArray prevWordCodePointArrays, + jbooleanArray isBeginningOfSentenceArray) { + int prevWordCodePoints[MAX_PREV_WORD_COUNT_FOR_N_GRAM][MAX_WORD_LENGTH]; + int prevWordCodePointCount[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + bool isBeginningOfSentence[MAX_PREV_WORD_COUNT_FOR_N_GRAM]; + jsize prevWordsCount = 
env->GetArrayLength(prevWordCodePointArrays); + for (size_t i = 0; i < NELEMS(prevWordCodePoints); ++i) { + prevWordCodePointCount[i] = 0; + isBeginningOfSentence[i] = false; + if (prevWordsCount <= static_cast<int>(i)) { + continue; + } + jintArray prevWord = (jintArray)env->GetObjectArrayElement(prevWordCodePointArrays, i); + if (!prevWord) { + continue; + } + jsize prevWordLength = env->GetArrayLength(prevWord); + if (prevWordLength > MAX_WORD_LENGTH) { + continue; + } + env->GetIntArrayRegion(prevWord, 0, prevWordLength, prevWordCodePoints[i]); + prevWordCodePointCount[i] = prevWordLength; + jboolean isBeginningOfSentenceBoolean = JNI_FALSE; + env->GetBooleanArrayRegion(isBeginningOfSentenceArray, i, 1 /* len */, + &isBeginningOfSentenceBoolean); + isBeginningOfSentence[i] = isBeginningOfSentenceBoolean == JNI_TRUE; + } + return PrevWordsInfo(prevWordCodePoints, prevWordCodePointCount, isBeginningOfSentence, + MAX_PREV_WORD_COUNT_FOR_N_GRAM); + } + static void putBooleanToArray(JNIEnv *env, jbooleanArray array, const int index, const jboolean value) { env->SetBooleanArrayRegion(array, index, 1 /* len */, &value); |