diff options
-rw-r--r-- | native/src/bigram_dictionary.h | 4 | ||||
-rw-r--r-- | native/src/binary_format.h | 4 | ||||
-rw-r--r-- | native/src/correction.h | 8 | ||||
-rw-r--r-- | native/src/dictionary.h | 4 | ||||
-rw-r--r-- | native/src/proximity_info.h | 4 | ||||
-rw-r--r-- | native/src/terminal_attributes.h | 78 | ||||
-rw-r--r-- | native/src/unigram_dictionary.cpp | 26 | ||||
-rw-r--r-- | native/src/unigram_dictionary.h | 11 | ||||
-rw-r--r-- | native/src/words_priority_queue.h | 5 | ||||
-rw-r--r-- | native/src/words_priority_queue_pool.h | 5 | ||||
-rw-r--r-- | tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java | 120 | ||||
-rw-r--r-- | tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java | 3 | ||||
-rw-r--r-- | tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java | 55 | ||||
-rw-r--r-- | tools/makedict/src/com/android/inputmethod/latin/Word.java | 7 | ||||
-rw-r--r-- | tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java | 3 | ||||
-rw-r--r-- | tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java | 10 |
16 files changed, 278 insertions, 69 deletions
diff --git a/native/src/bigram_dictionary.h b/native/src/bigram_dictionary.h index c07458a38..585a1866a 100644 --- a/native/src/bigram_dictionary.h +++ b/native/src/bigram_dictionary.h @@ -21,14 +21,14 @@ namespace latinime { class Dictionary; class BigramDictionary { -public: + public: BigramDictionary(const unsigned char *dict, int maxWordLength, int maxAlternatives, const bool isLatestDictVersion, const bool hasBigram, Dictionary *parentDictionary); int getBigrams(unsigned short *word, int length, int *codes, int codesSize, unsigned short *outWords, int *frequencies, int maxWordLength, int maxBigrams, int maxAlternatives); ~BigramDictionary(); -private: + private: bool addWordBigram(unsigned short *word, int length, int frequency); int getBigramAddress(int *pos, bool advance); int getBigramFreq(int *pos); diff --git a/native/src/binary_format.h b/native/src/binary_format.h index cbaccb295..9944fa2bd 100644 --- a/native/src/binary_format.h +++ b/native/src/binary_format.h @@ -22,12 +22,12 @@ namespace latinime { class BinaryFormat { -private: + private: const static int32_t MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20; const static int32_t CHARACTER_ARRAY_TERMINATOR = 0x1F; const static int MULTIPLE_BYTE_CHARACTER_ADDITIONAL_SIZE = 2; -public: + public: const static int UNKNOWN_FORMAT = -1; const static int FORMAT_VERSION_1 = 1; const static uint16_t FORMAT_VERSION_1_MAGIC_NUMBER = 0x78B1; diff --git a/native/src/correction.h b/native/src/correction.h index e55be8dd6..9ba472955 100644 --- a/native/src/correction.h +++ b/native/src/correction.h @@ -27,8 +27,7 @@ namespace latinime { class ProximityInfo; class Correction { - -public: + public: typedef enum { TRAVERSE_ALL_ON_TERMINAL, TRAVERSE_ALL_NOT_ON_TERMINAL, @@ -95,7 +94,8 @@ public: inline int getTreeParentIndex(const int index) const { return mCorrectionStates[index].mParentIndex; } -private: + + private: inline void incrementInputIndex(); inline void incrementOutputIndex(); inline bool needsToTraverseAllNodes(); @@ -154,7 +154,7 @@ private: bool mSkipping; class RankingAlgorithm { - public: + public: static int calculateFinalFreq(const int inputIndex, const int depth, const int freq, int *editDistanceTable, const Correction* correction); static int calcFreqForSplitTwoWords(const int firstFreq, const int secondFreq, diff --git a/native/src/dictionary.h b/native/src/dictionary.h index 52048ecca..79d377a4f 100644 --- a/native/src/dictionary.h +++ b/native/src/dictionary.h @@ -28,7 +28,7 @@ namespace latinime { class Dictionary { -public: + public: Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives); @@ -67,7 +67,7 @@ public: const int pos, unsigned short *c, int *childrenPosition, bool *terminal, int *freq); -private: + private: bool hasBigram(); const unsigned char *mDict; diff --git a/native/src/proximity_info.h b/native/src/proximity_info.h index 832db1062..9ca5505a7 100644 --- a/native/src/proximity_info.h +++ b/native/src/proximity_info.h @@ -26,7 +26,7 @@ namespace latinime { class Correction; class ProximityInfo { -public: + public: static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2 = 10; static const int NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR = 1 << NORMALIZED_SQUARED_DISTANCE_SCALING_FACTOR_LOG_2; @@ -68,7 +68,7 @@ public: return mTouchPositionCorrectionEnabled; } -private: + private: // The max number of the keys in one keyboard layout static const int MAX_KEY_COUNT_IN_A_KEYBOARD = 64; // The upper limit of the char code in mCodeToKeyIndex diff --git a/native/src/terminal_attributes.h b/native/src/terminal_attributes.h new file mode 100644 index 000000000..1f9815936 --- /dev/null +++ b/native/src/terminal_attributes.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2012 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LATINIME_TERMINAL_ATTRIBUTES_H +#define LATINIME_TERMINAL_ATTRIBUTES_H + +#include "unigram_dictionary.h" + +namespace latinime { + +/** + * This class encapsulates information about a terminal that allows to + * retrieve local node attributes like the list of shortcuts without + * exposing the format structure to the client. + */ +class TerminalAttributes { + public: + class ShortcutIterator { + const uint8_t* const mDict; + bool mHasNextShortcutTarget; + int mPos; + + public: + ShortcutIterator(const uint8_t* dict, const int pos, const uint8_t flags) : mDict(dict), + mPos(pos) { + mHasNextShortcutTarget = (0 != (flags & UnigramDictionary::FLAG_HAS_SHORTCUT_TARGETS)); + } + + inline bool hasNextShortcutTarget() const { + return mHasNextShortcutTarget; + } + + // Gets the shortcut target itself as a uint16_t string. For parameters and return value + // see BinaryFormat::getWordAtAddress. + inline int getNextShortcutTarget(const int maxDepth, uint16_t* outWord) { + const int shortcutFlags = BinaryFormat::getFlagsAndForwardPointer(mDict, &mPos); + mHasNextShortcutTarget = + 0 != (shortcutFlags & UnigramDictionary::FLAG_ATTRIBUTE_HAS_NEXT); + int shortcutAddress = + BinaryFormat::getAttributeAddressAndForwardPointer(mDict, shortcutFlags, &mPos); + return BinaryFormat::getWordAtAddress(mDict, shortcutAddress, maxDepth, outWord); + } + }; + + private: + const uint8_t* const mDict; + const uint8_t mFlags; + const int mStartPos; + + public: + TerminalAttributes(const uint8_t* const dict, const uint8_t flags, const int pos) : + mDict(dict), mFlags(flags), mStartPos(pos) { + } + + inline bool isShortcutOnly() const { + return 0 != (mFlags & UnigramDictionary::FLAG_IS_SHORTCUT_ONLY); + } + + inline ShortcutIterator getShortcutIterator() const { + return ShortcutIterator(mDict, mStartPos, mFlags); + } +}; +} // namespace latinime + +#endif // LATINIME_TERMINAL_ATTRIBUTES_H diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp index 7c3c35e40..e95e03ce5 100644 --- a/native/src/unigram_dictionary.cpp +++ b/native/src/unigram_dictionary.cpp @@ -25,6 +25,7 @@ #include "unigram_dictionary.h" #include "binary_format.h" +#include "terminal_attributes.h" namespace latinime { @@ -324,13 +325,28 @@ void UnigramDictionary::getMistypedSpaceWords(ProximityInfo *proximityInfo, cons correction, queuePool); } -inline void UnigramDictionary::onTerminal( - const int freq, Correction *correction, WordsPriorityQueue *queue) { +inline void UnigramDictionary::onTerminal(const int freq, + const TerminalAttributes& terminalAttributes, Correction *correction, + WordsPriorityQueue *queue) { int wordLength; unsigned short* wordPointer; const int finalFreq = correction->getFinalFreq(freq, &wordPointer, &wordLength); if (finalFreq >= 0) { - addWord(wordPointer, wordLength, finalFreq, queue); + if (!terminalAttributes.isShortcutOnly()) { + addWord(wordPointer, wordLength, finalFreq, queue); + } + TerminalAttributes::ShortcutIterator iterator = terminalAttributes.getShortcutIterator(); + while (iterator.hasNextShortcutTarget()) { + // TODO: addWord only supports weak ordering, meaning we have no means to control the + // order of the shortcuts relative to one another or to the word. We need to either + // modulate the frequency of each shortcut according to its own shortcut frequency or + // to make the queue so that the insert order is protected inside the queue for words + // with the same score. + uint16_t shortcutTarget[MAX_WORD_LENGTH_INTERNAL]; + const int shortcutTargetStringLength = iterator.getNextShortcutTarget( + MAX_WORD_LENGTH_INTERNAL, shortcutTarget); + addWord(shortcutTarget, shortcutTargetStringLength, finalFreq, queue); + } } } @@ -646,7 +662,9 @@ inline bool UnigramDictionary::processCurrentNode(const int initialPos, // The frequency should be here, because we come here only if this is actually // a terminal node, and we are on its last char. const int freq = BinaryFormat::readFrequencyWithoutMovingPointer(DICT_ROOT, pos); - onTerminal(freq, correction, queue); + TerminalAttributes terminalAttributes(DICT_ROOT, flags, + BinaryFormat::skipFrequency(flags, pos)); + onTerminal(freq, terminalAttributes, correction, queue); } // If there are more chars in this node, then this virtual node has children. diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h index afe92e5b9..23581425a 100644 --- a/native/src/unigram_dictionary.h +++ b/native/src/unigram_dictionary.h @@ -27,10 +27,9 @@ namespace latinime { +class TerminalAttributes; class UnigramDictionary { - -public: - + public: // Mask and flags for children address type selection. static const int MASK_GROUP_ADDRESS_TYPE = 0xC0; static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00; @@ -83,8 +82,7 @@ public: unsigned short *outWords, int *frequencies); virtual ~UnigramDictionary(); -private: - + private: void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates, const int *codes, const int inputLength, const int flags, Correction *correction, WordsPriorityQueuePool *queuePool); @@ -115,7 +113,8 @@ private: const int *ycoordinates, const int *codes, const bool useFullEditDistance, const int inputLength, const int spaceProximityPos, Correction *correction, WordsPriorityQueuePool* queuePool); - void onTerminal(const int freq, Correction *correction, WordsPriorityQueue *queue); + void onTerminal(const int freq, const TerminalAttributes& terminalAttributes, + Correction *correction, WordsPriorityQueue *queue); bool needsToSkipCurrentNode(const unsigned short c, const int inputIndex, const int skipPos, const int depth); // Process a node by considering proximity, missing and excessive character diff --git a/native/src/words_priority_queue.h b/native/src/words_priority_queue.h index 2d6270977..84f2523c2 100644 --- a/native/src/words_priority_queue.h +++ b/native/src/words_priority_queue.h @@ -24,7 +24,7 @@ namespace latinime { class WordsPriorityQueue { -public: + public: class SuggestedWord { public: int mScore; @@ -126,7 +126,8 @@ public: mSuggestions.pop(); } } -private: + + private: struct wordComparator { bool operator ()(SuggestedWord * left, SuggestedWord * right) { return left->mScore > right->mScore; diff --git a/native/src/words_priority_queue_pool.h b/native/src/words_priority_queue_pool.h index d964bfc3b..386297650 100644 --- a/native/src/words_priority_queue_pool.h +++ b/native/src/words_priority_queue_pool.h @@ -22,7 +22,7 @@ namespace latinime { class WordsPriorityQueuePool { -public: + public: WordsPriorityQueuePool(int mainQueueMaxWords, int subQueueMaxWords, int maxWordLength) { mMasterQueue = new WordsPriorityQueue(mainQueueMaxWords, maxWordLength); mSubQueue1 = new WordsPriorityQueue(subQueueMaxWords, maxWordLength); @@ -43,7 +43,8 @@ public: WordsPriorityQueue* getSubQueue2() { return mSubQueue2; } -private: + + private: WordsPriorityQueue *mMasterQueue; WordsPriorityQueue *mSubQueue1; WordsPriorityQueue *mSubQueue2; diff --git a/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java index 92f402d3e..b7826f065 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/BinaryDictInputOutput.java @@ -26,6 +26,7 @@ import java.io.OutputStream; import java.io.RandomAccessFile; import java.util.ArrayList; import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.TreeMap; @@ -44,8 +45,9 @@ public class BinaryDictInputOutput { * a | 11 = 3 bytes : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES * g | has several chars ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_MULTIPLE_CHARS * s | has a terminal ? 1 bit, 1 = yes, 0 = no : FLAG_IS_TERMINAL - * | reserved 1 bit, 1 = yes, 0 = no + * | has shortcut targets ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_SHORTCUT_TARGETS * | has bigrams ? 1 bit, 1 = yes, 0 = no : FLAG_HAS_BIGRAMS + * | is shortcut only ? 1 bit, 1 = yes, 0 = no : FLAG_IS_SHORTCUT_ONLY * * c | IF FLAG_HAS_MULTIPLE_CHARS * h | char, char, char, char n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers @@ -71,6 +73,8 @@ public class BinaryDictInputOutput { * d * dress * + * | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS + * | shortcut targets address list * | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS * | bigrams address list * @@ -126,7 +130,9 @@ public class BinaryDictInputOutput { private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20; private static final int FLAG_IS_TERMINAL = 0x10; + private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08; private static final int FLAG_HAS_BIGRAMS = 0x04; + private static final int FLAG_IS_SHORTCUT_ONLY = 0x02; private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80; private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40; @@ -271,10 +277,13 @@ public class BinaryDictInputOutput { // If terminal, one byte for the frequency if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE; size += GROUP_MAX_ADDRESS_SIZE; // For children address + if (null != group.mShortcutTargets) { + size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) + * group.mShortcutTargets.size(); + } if (null != group.mBigrams) { - for (WeightedString bigram : group.mBigrams) { - size += GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE; - } + size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE) + * group.mBigrams.size(); } return size; } @@ -387,6 +396,15 @@ public class BinaryDictInputOutput { final int offset = group.mChildren.mCachedAddress - offsetBasePoint; groupSize += getByteSize(offset); } + if (null != group.mShortcutTargets) { + for (WeightedString target : group.mShortcutTargets) { + final int offsetBasePoint = groupSize + node.mCachedAddress + size + + GROUP_FLAGS_SIZE; + final int addressOfTarget = findAddressOfWord(dict, target.mWord); + final int offset = addressOfTarget - offsetBasePoint; + groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE; + } + } if (null != group.mBigrams) { for (WeightedString bigram : group.mBigrams) { final int offsetBasePoint = groupSize + node.mCachedAddress + size @@ -545,7 +563,19 @@ public class BinaryDictInputOutput { throw new RuntimeException("Node with a strange address"); } } - if (null != group.mBigrams) flags |= FLAG_HAS_BIGRAMS; + if (null != group.mShortcutTargets) { + if (0 == group.mShortcutTargets.size()) { + throw new RuntimeException("0-sized shortcut list must be null"); + } + flags |= FLAG_HAS_SHORTCUT_TARGETS; + } + if (null != group.mBigrams) { + if (0 == group.mBigrams.size()) { + throw new RuntimeException("0-sized bigram list must be null"); + } + flags |= FLAG_HAS_BIGRAMS; + } + // TODO: fill in the FLAG_IS_SHORTCUT_ONLY return flags; } @@ -624,20 +654,36 @@ public class BinaryDictInputOutput { index += shift; groupAddress += shift; + // Write shortcuts + if (null != group.mShortcutTargets) { + final Iterator shortcutIterator = group.mShortcutTargets.iterator(); + while (shortcutIterator.hasNext()) { + final WeightedString target = (WeightedString)shortcutIterator.next(); + final int addressOfTarget = findAddressOfWord(dict, target.mWord); + ++groupAddress; + final int offset = addressOfTarget - groupAddress; + int shortcutFlags = makeAttributeFlags(shortcutIterator.hasNext(), offset, + target.mFrequency); + buffer[index++] = (byte)shortcutFlags; + final int shortcutShift = writeVariableAddress(buffer, index, Math.abs(offset)); + index += shortcutShift; + groupAddress += shortcutShift; + } + } // Write bigrams if (null != group.mBigrams) { - int remainingBigrams = group.mBigrams.size(); - for (WeightedString bigram : group.mBigrams) { - boolean more = remainingBigrams > 1; + final Iterator bigramIterator = group.mBigrams.iterator(); + while (bigramIterator.hasNext()) { + final WeightedString bigram = (WeightedString)bigramIterator.next(); final int addressOfBigram = findAddressOfWord(dict, bigram.mWord); ++groupAddress; final int offset = addressOfBigram - groupAddress; - int bigramFlags = makeAttributeFlags(more, offset, bigram.mFrequency); + int bigramFlags = makeAttributeFlags(bigramIterator.hasNext(), offset, + bigram.mFrequency); buffer[index++] = (byte)bigramFlags; final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset)); index += bigramShift; groupAddress += bigramShift; - --remainingBigrams; } } @@ -814,14 +860,43 @@ public class BinaryDictInputOutput { childrenAddress = NO_CHILDREN_ADDRESS; break; } + ArrayList<PendingAttribute> shortcutTargets = null; + if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) { + shortcutTargets = new ArrayList<PendingAttribute>(); + while (true) { + final int targetFlags = source.readUnsignedByte(); + ++addressPointer; + final int sign = 0 == (targetFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; + int targetAddress = addressPointer; + switch (targetFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { + case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE: + targetAddress += sign * source.readUnsignedByte(); + addressPointer += 1; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES: + targetAddress += sign * source.readUnsignedShort(); + addressPointer += 2; + break; + case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES: + final int offset = ((source.readUnsignedByte() << 16) + + source.readUnsignedShort()); + targetAddress += sign * offset; + addressPointer += 3; + break; + default: + throw new RuntimeException("Has attribute with no address"); + } + shortcutTargets.add(new PendingAttribute(targetFlags & FLAG_ATTRIBUTE_FREQUENCY, + targetAddress)); + if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; + } + } ArrayList<PendingAttribute> bigrams = null; if (0 != (flags & FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); - boolean more = true; - while (more) { - int bigramFlags = source.readUnsignedByte(); + while (true) { + final int bigramFlags = source.readUnsignedByte(); ++addressPointer; - more = (0 != (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)); final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1; int bigramAddress = addressPointer; switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) { @@ -844,10 +919,11 @@ public class BinaryDictInputOutput { } bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY, bigramAddress)); + if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break; } } return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency, - childrenAddress, bigrams); + childrenAddress, shortcutTargets, bigrams); } /** @@ -925,6 +1001,14 @@ public class BinaryDictInputOutput { int groupOffset = nodeOrigin + 1; // 1 byte for the group count for (int i = count; i > 0; --i) { CharGroupInfo info = readCharGroup(source, groupOffset); + ArrayList<WeightedString> shortcutTargets = null; + if (null != info.mShortcutTargets) { + shortcutTargets = new ArrayList<WeightedString>(); + for (PendingAttribute target : info.mShortcutTargets) { + final String word = getWordAtAddress(source, headerSize, target.mAddress); + shortcutTargets.add(new WeightedString(word, target.mFrequency)); + } + } ArrayList<WeightedString> bigrams = null; if (null != info.mBigrams) { bigrams = new ArrayList<WeightedString>(); @@ -942,11 +1026,11 @@ public class BinaryDictInputOutput { source.seek(currentPosition); } nodeContents.add( - new CharGroup(info.mCharacters, bigrams, info.mFrequency, + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency, children)); } else { nodeContents.add( - new CharGroup(info.mCharacters, bigrams, info.mFrequency)); + new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency)); } groupOffset = info.mEndAddress; } @@ -996,7 +1080,7 @@ public class BinaryDictInputOutput { new FusionDictionary.DictionaryOptions()); if (null != dict) { for (Word w : dict) { - newDict.add(w.mWord, w.mFrequency, w.mBigrams); + newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets, w.mBigrams); } } diff --git a/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java b/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java index 6badfd13a..759cd452d 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java +++ b/tools/makedict/src/com/android/inputmethod/latin/CharGroupInfo.java @@ -29,10 +29,12 @@ public class CharGroupInfo { public final int[] mCharacters; public final int mFrequency; public final int mChildrenAddress; + public final ArrayList<PendingAttribute> mShortcutTargets; public final ArrayList<PendingAttribute> mBigrams; public CharGroupInfo(final int originalAddress, final int endAddress, final int flags, final int[] characters, final int frequency, final int childrenAddress, + final ArrayList<PendingAttribute> shortcutTargets, final ArrayList<PendingAttribute> bigrams) { mOriginalAddress = originalAddress; mEndAddress = endAddress; @@ -40,6 +42,7 @@ public class CharGroupInfo { mCharacters = characters; mFrequency = frequency; mChildrenAddress = childrenAddress; + mShortcutTargets = shortcutTargets; mBigrams = bigrams; } } diff --git a/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java b/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java index f6220eea2..50def5e62 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java +++ b/tools/makedict/src/com/android/inputmethod/latin/FusionDictionary.java @@ -68,7 +68,7 @@ public class FusionDictionary implements Iterable<Word> { } /** - * A group of characters, with a frequency, shortcuts, bigrams, and children. + * A group of characters, with a frequency, shortcut targets, bigrams, and children. * * This is the central class of the in-memory representation. A CharGroup is what can * be seen as a traditional "trie node", except it can hold several characters at the @@ -82,6 +82,7 @@ public class FusionDictionary implements Iterable<Word> { public static class CharGroup { public static final int NOT_A_TERMINAL = -1; final int mChars[]; + final ArrayList<WeightedString> mShortcutTargets; final ArrayList<WeightedString> mBigrams; final int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. Node mChildren; @@ -89,18 +90,20 @@ public class FusionDictionary implements Iterable<Word> { int mCachedSize; int mCachedAddress; - public CharGroup(final int[] chars, + public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency) { mChars = chars; mFrequency = frequency; + mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = null; } - public CharGroup(final int[] chars, + public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final Node children) { mChars = chars; mFrequency = frequency; + mShortcutTargets = shortcutTargets; mBigrams = bigrams; mChildren = children; } @@ -165,18 +168,29 @@ public class FusionDictionary implements Iterable<Word> { * * @param word the word to add. * @param frequency the frequency of the word, in the range [0..255]. + * @param shortcutTargets a list of shortcut targets for this word, or null. * @param bigrams a list of bigrams, or null. */ - public void add(String word, int frequency, ArrayList<WeightedString> bigrams) { + public void add(final String word, final int frequency, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams) { + if (null != shortcutTargets) { + for (WeightedString target : shortcutTargets) { + final CharGroup t = findWordInTree(mRoot, target.mWord); + if (null == t) { + add(getCodePoints(target.mWord), 0, null, null); + } + } + } if (null != bigrams) { for (WeightedString bigram : bigrams) { final CharGroup t = findWordInTree(mRoot, bigram.mWord); if (null == t) { - add(getCodePoints(bigram.mWord), 0, null); + add(getCodePoints(bigram.mWord), 0, null, null); } } } - add(getCodePoints(word), frequency, bigrams); + add(getCodePoints(word), frequency, shortcutTargets, bigrams); } /** @@ -200,14 +214,17 @@ public class FusionDictionary implements Iterable<Word> { /** * Add a word to this dictionary. * - * The bigrams, if any, have to be in the dictionary already. If they aren't, + * The shortcuts and bigrams, if any, have to be in the dictionary already. If they aren't, * an exception is thrown. * * @param word the word, as an int array. * @param frequency the frequency of the word, in the range [0..255]. + * @param shortcutTargets an optional list of shortcut targets for this word (null if none). * @param bigrams an optional list of bigrams for this word (null if none). */ - private void add(int[] word, int frequency, ArrayList<WeightedString> bigrams) { + private void add(final int[] word, final int frequency, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams) { assert(frequency >= 0 && frequency <= 255); Node currentNode = mRoot; int charIndex = 0; @@ -231,7 +248,8 @@ public class FusionDictionary implements Iterable<Word> { // No node at this point to accept the word. Create one. final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); final CharGroup newGroup = new CharGroup( - Arrays.copyOfRange(word, charIndex, word.length), bigrams, frequency); + Arrays.copyOfRange(word, charIndex, word.length), + shortcutTargets, bigrams, frequency); currentNode.mData.add(insertionIndex, newGroup); checkStack(currentNode); } else { @@ -245,7 +263,7 @@ public class FusionDictionary implements Iterable<Word> { + new String(word, 0, word.length)); } else { final CharGroup newNode = new CharGroup(currentGroup.mChars, - bigrams, frequency, currentGroup.mChildren); + shortcutTargets, bigrams, frequency, currentGroup.mChildren); currentNode.mData.set(nodeIndex, newNode); checkStack(currentNode); } @@ -254,7 +272,7 @@ public class FusionDictionary implements Iterable<Word> { // We only have to create a new node and add it to the end of this. final CharGroup newNode = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), - bigrams, frequency); + shortcutTargets, bigrams, frequency); currentGroup.mChildren = new Node(); currentGroup.mChildren.mData.add(newNode); } @@ -268,7 +286,8 @@ public class FusionDictionary implements Iterable<Word> { + new String(word, 0, word.length)); } final CharGroup newGroup = new CharGroup(word, - currentGroup.mBigrams, frequency, currentGroup.mChildren); + currentGroup.mShortcutTargets, currentGroup.mBigrams, + frequency, currentGroup.mChildren); currentNode.mData.set(nodeIndex, newGroup); } } else { @@ -277,7 +296,7 @@ public class FusionDictionary implements Iterable<Word> { Node newChildren = new Node(); final CharGroup newOldWord = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, - currentGroup.mChars.length), + currentGroup.mChars.length), currentGroup.mShortcutTargets, currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren); newChildren.mData.add(newOldWord); @@ -285,14 +304,14 @@ public class FusionDictionary implements Iterable<Word> { if (charIndex + differentCharIndex >= word.length) { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - bigrams, frequency, newChildren); + shortcutTargets, bigrams, frequency, newChildren); } else { newParent = new CharGroup( Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), - null, -1, newChildren); + null, null, -1, newChildren); final CharGroup newWord = new CharGroup( Arrays.copyOfRange(word, charIndex + differentCharIndex, - word.length), bigrams, frequency); + word.length), shortcutTargets, bigrams, frequency); final int addIndex = word[charIndex + differentCharIndex] > currentGroup.mChars[differentCharIndex] ? 1 : 0; newChildren.mData.add(addIndex, newWord); @@ -355,7 +374,7 @@ public class FusionDictionary implements Iterable<Word> { */ private static int findInsertionIndex(final Node node, int character) { final List data = node.mData; - final CharGroup reference = new CharGroup(new int[] { character }, null, 0); + final CharGroup reference = new CharGroup(new int[] { character }, null, null, 0); int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); return result >= 0 ? result : -result - 1; } @@ -573,7 +592,7 @@ public class FusionDictionary implements Iterable<Word> { } if (currentGroup.mFrequency >= 0) return new Word(mCurrentString.toString(), currentGroup.mFrequency, - currentGroup.mBigrams); + currentGroup.mShortcutTargets, currentGroup.mBigrams); } else { mPositions.removeLast(); currentPos = mPositions.getLast(); diff --git a/tools/makedict/src/com/android/inputmethod/latin/Word.java b/tools/makedict/src/com/android/inputmethod/latin/Word.java index 916165a41..561b21bb3 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/Word.java +++ b/tools/makedict/src/com/android/inputmethod/latin/Word.java @@ -28,11 +28,15 @@ import java.util.ArrayList; public class Word implements Comparable<Word> { final String mWord; final int mFrequency; + final ArrayList<WeightedString> mShortcutTargets; final ArrayList<WeightedString> mBigrams; - public Word(String word, int frequency, ArrayList<WeightedString> bigrams) { + public Word(final String word, final int frequency, + final ArrayList<WeightedString> shortcutTargets, + final ArrayList<WeightedString> bigrams) { mWord = word; mFrequency = frequency; + mShortcutTargets = shortcutTargets; mBigrams = bigrams; } @@ -60,6 +64,7 @@ public class Word implements Comparable<Word> { if (!(o instanceof Word)) return false; Word w = (Word)o; return mFrequency == w.mFrequency && mWord.equals(w.mWord) + && mShortcutTargets.equals(w.mShortcutTargets) && mBigrams.equals(w.mBigrams); } } diff --git a/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java b/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java index 4720e9d10..19ed9d8d2 100644 --- a/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java +++ b/tools/makedict/src/com/android/inputmethod/latin/XmlDictInputOutput.java @@ -107,7 +107,8 @@ public class XmlDictInputOutput { @Override public void endElement(String uri, String localName, String qName) { if (WORD == mState) { - mDictionary.add(mWord, mFreq, mBigramsMap.get(mWord)); + // TODO: pass the shortcut targets + mDictionary.add(mWord, mFreq, null, mBigramsMap.get(mWord)); mState = START; } } diff --git a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java index 79cf14b2b..6ac046bbf 100644 --- a/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java +++ b/tools/makedict/tests/com/android/inputmethod/latin/BinaryDictInputOutputTest.java @@ -39,11 +39,11 @@ public class BinaryDictInputOutputTest extends TestCase { // that it does not contain any duplicates. public void testFlattenNodes() { final FusionDictionary dict = new FusionDictionary(); - dict.add("foo", 1, null); - dict.add("fta", 1, null); - dict.add("ftb", 1, null); - dict.add("bar", 1, null); - dict.add("fool", 1, null); + dict.add("foo", 1, null, null); + dict.add("fta", 1, null, null); + dict.add("ftb", 1, null, null); + dict.add("bar", 1, null, null); + dict.add("fool", 1, null, null); final ArrayList<Node> result = BinaryDictInputOutput.flattenTree(dict.mRoot); assertEquals(4, result.size()); while (!result.isEmpty()) { |