aboutsummaryrefslogtreecommitdiffstats
path: root/native/src/unigram_dictionary.h
blob: 396a81149ab6a6938550fb6c7526446bf2e3c1d8 (about) (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LATINIME_UNIGRAM_DICTIONARY_H
#define LATINIME_UNIGRAM_DICTIONARY_H

#include <stdint.h>
#include "correction.h"
#include "correction_state.h"
#include "defines.h"
#include "proximity_info.h"
#include "words_priority_queue.h"
#include "words_priority_queue_pool.h"

namespace latinime {

class TerminalAttributes;
class UnigramDictionary {
 public:
    // Mask and flags for children address type selection.
    static const int MASK_GROUP_ADDRESS_TYPE = 0xC0;
    static const int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
    static const int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
    static const int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
    static const int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;

    // Flag for single/multiple char group
    static const int FLAG_HAS_MULTIPLE_CHARS = 0x20;

    // Flag for terminal groups
    static const int FLAG_IS_TERMINAL = 0x10;

    // Flag for shortcut targets presence
    static const int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
    // Flag for bigram presence
    static const int FLAG_HAS_BIGRAMS = 0x04;
    // Flag for shortcut-only words. Some words are shortcut-only, which means they match when
    // the user types them but they don't pop in the suggestion strip, only the words they are
    // shortcuts for do.
    static const int FLAG_IS_SHORTCUT_ONLY = 0x02;

    // Attribute (bigram/shortcut) related flags:
    // Flag for presence of more attributes
    static const int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
    // Flag for sign of offset. If this flag is set, the offset value must be negated.
    static const int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;

    // Mask for attribute frequency, stored on 4 bits inside the flags byte.
    static const int MASK_ATTRIBUTE_FREQUENCY = 0x0F;

    // Mask and flags for attribute address type selection.
    static const int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
    static const int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;

    // Error tolerances
    static const int DEFAULT_MAX_ERRORS = 2;
    static const int MAX_ERRORS_FOR_TWO_WORDS = 1;

    UnigramDictionary(const uint8_t* const streamStart, int typedLetterMultipler,
            int fullWordMultiplier, int maxWordLength, int maxWords, int maxProximityChars,
            const bool isLatestDictVersion);
    bool isValidWord(const uint16_t* const inWord, const int length) const;
    int getBigramPosition(int pos, unsigned short *word, int offset, int length) const;
    int getSuggestions(ProximityInfo *proximityInfo, WordsPriorityQueuePool *queuePool,
            Correction *correction, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize, const int flags,
            unsigned short *outWords, int *frequencies);
    virtual ~UnigramDictionary();

 private:
    void getWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int inputLength,
            const int flags, Correction *correction, WordsPriorityQueuePool *queuePool);
    bool isDigraph(const int *codes, const int i, const int codesSize) const;
    void getWordWithDigraphSuggestionsRec(ProximityInfo *proximityInfo,
        const int *xcoordinates, const int* ycoordinates, const int *codesBuffer,
        const int codesBufferSize, const int flags, const int* codesSrc,
        const int codesRemain, const int currentDepth, int* codesDest, Correction *correction,
        WordsPriorityQueuePool* queuePool);
    void initSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const int codesSize, Correction *correction);
    void getOneWordSuggestions(ProximityInfo *proximityInfo, const int *xcoordinates,
            const int *ycoordinates, const int *codes, const bool useFullEditDistance,
            const int inputLength, Correction *correction, WordsPriorityQueuePool* queuePool);
    void getSuggestionCandidates(
            const bool useFullEditDistance, const int inputLength, Correction *correction,
            WordsPriorityQueuePool* queuePool, const bool doAutoCompletion, const int maxErrors,
            const int currentWordIndex);
    void getSplitMultipleWordsSuggestions(ProximityInfo *proximityInfo,
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength,
            Correction *correction, WordsPriorityQueuePool* queuePool,
            const bool hasAutoCorrectionCandidate);
    void onTerminal(const int freq, const TerminalAttributes& terminalAttributes,
            Correction *correction, WordsPriorityQueuePool *queuePool, const bool addToMasterQueue,
            const int currentWordIndex);
    bool needsToSkipCurrentNode(const unsigned short c,
            const int inputIndex, const int skipPos, const int depth);
    // Process a node by considering proximity, missing and excessive character
    bool processCurrentNode(const int initialPos, Correction *correction, int *newCount,
            int *newChildPosition, int *nextSiblingPosition, WordsPriorityQueuePool *queuePool,
            const int currentWordIndex);
    int getMostFrequentWordLike(const int startInputIndex, const int inputLength,
            ProximityInfo *proximityInfo, unsigned short *word);
    int getMostFrequentWordLikeInner(const uint16_t* const inWord, const int length,
            short unsigned int *outWord);
    bool getSubStringSuggestion(
            ProximityInfo *proximityInfo, const int *xcoordinates, const int *ycoordinates,
            const int *codes, const bool useFullEditDistance, Correction *correction,
            WordsPriorityQueuePool* queuePool, const int inputLength,
            const bool hasAutoCorrectionCandidate, const int currentWordIndex,
            const int inputWordStartPos, const int inputWordLength,
            const int outputWordStartPos, const bool isSpaceProximity, int *freqArray,
            int *wordLengthArray, unsigned short* outputWord, int *outputWordLength);
    void getMultiWordsSuggestionRec(ProximityInfo *proximityInfo,
            const int *xcoordinates, const int *ycoordinates, const int *codes,
            const bool useFullEditDistance, const int inputLength,
            Correction *correction, WordsPriorityQueuePool* queuePool,
            const bool hasAutoCorrectionCandidate, const int startPos, const int startWordIndex,
            const int outputWordLength, int *freqArray, int* wordLengthArray,
            unsigned short* outputWord);

    const uint8_t* const DICT_ROOT;
    const int MAX_WORD_LENGTH;
    const int MAX_WORDS;
    const int MAX_PROXIMITY_CHARS;
    const bool IS_LATEST_DICT_VERSION;
    const int TYPED_LETTER_MULTIPLIER;
    const int FULL_WORD_MULTIPLIER;
    const int ROOT_POS;
    const unsigned int BYTES_IN_ONE_CHAR;
    const int MAX_UMLAUT_SEARCH_DEPTH;

    // Flags for special processing
    // Those *must* match the flags in BinaryDictionary.Flags.ALL_FLAGS in BinaryDictionary.java
    // or something very bad (like, the apocalypse) will happen.
    // Please update both at the same time.
    enum {
        REQUIRES_GERMAN_UMLAUT_PROCESSING = 0x1,
        USE_FULL_EDIT_DISTANCE = 0x2
    };
    static const struct digraph_t { int first; int second; } GERMAN_UMLAUT_DIGRAPHS[];

    // Still bundled members
    unsigned short mWord[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    int mStackChildCount[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    int mStackInputIndex[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
    int mStackSiblingPos[MAX_WORD_LENGTH_INTERNAL];// TODO: remove
};
} // namespace latinime

#endif // LATINIME_UNIGRAM_DICTIONARY_H