6 files changed, 134 insertions, 38 deletions
diff --git a/native/src/bigram_dictionary.cpp b/native/src/bigram_dictionary.cpp
index eebd69b71..5ec310f07 100644
--- a/native/src/bigram_dictionary.cpp
+++ b/native/src/bigram_dictionary.cpp
@@ -31,7 +31,7 @@ BigramDictionary::BigramDictionary(const unsigned char *dict, int maxWordLength,
     MAX_ALTERNATIVES(maxAlternatives), IS_LATEST_DICT_VERSION(isLatestDictVersion),
     HAS_BIGRAM(hasBigram), mParentDictionary(parentDictionary) {
     if (DEBUG_DICT) LOGI("BigramDictionary - constructor");
-    if (DEBUG_DICT) LOGI("Has Bigram : %d \n", hasBigram);
+    if (DEBUG_DICT) LOGI("Has Bigram : %d", hasBigram);
 }
 
 BigramDictionary::~BigramDictionary() {
@@ -42,7 +42,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
     if (DEBUG_DICT) {
         char s[length + 1];
         for (int i = 0; i <= length; i++) s[i] = word[i];
-        LOGI("Bigram: Found word = %s, freq = %d : \n", s, frequency);
+        LOGI("Bigram: Found word = %s, freq = %d :", s, frequency);
     }
 
     // Find the right insertion point
@@ -54,7 +54,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
         }
         insertAt++;
     }
-    if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d\n", insertAt, mMaxBigrams);
+    if (DEBUG_DICT) LOGI("Bigram: InsertAt -> %d maxBigrams: %d", insertAt, mMaxBigrams);
     if (insertAt < mMaxBigrams) {
         memmove((char*) mBigramFreq + (insertAt + 1) * sizeof(mBigramFreq[0]),
                (char*) mBigramFreq + insertAt * sizeof(mBigramFreq[0]),
@@ -68,7 +68,7 @@ bool BigramDictionary::addWordBigram(unsigned short *word, int length, int frequ
             *dest++ = *word++;
         }
         *dest = 0; // NULL terminate
-        if (DEBUG_DICT) LOGI("Bigram: Added word at %d\n", insertAt);
+        if (DEBUG_DICT) LOGI("Bigram: Added word at %d", insertAt);
         return true;
     }
     return false;
@@ -107,7 +107,7 @@ int BigramDictionary::getBigrams(unsigned short *prevWord, int prevWordLength, i
     if (HAS_BIGRAM && IS_LATEST_DICT_VERSION) {
         int pos = mParentDictionary->isValidWordRec(
                 DICTIONARY_HEADER_SIZE, prevWord, 0, prevWordLength);
-        if (DEBUG_DICT) LOGI("Pos -> %d\n", pos);
+        if (DEBUG_DICT) LOGI("Pos -> %d", pos);
         if (pos < 0) {
             return 0;
         }
@@ -151,7 +151,7 @@ void BigramDictionary::searchForTerminalNode(int addressLookingFor, int frequenc
         }
         pos = followDownBranchAddress; // pos start at count
         int count = DICT[pos] & 0xFF;
-        if (DEBUG_DICT) LOGI("count - %d\n",count);
+        if (DEBUG_DICT) LOGI("count - %d",count);
         pos++;
         for (int i = 0; i < count; i++) {
             // pos at data
diff --git a/native/src/defines.h b/native/src/defines.h
index 73394ce36..71aaf28ae 100644
--- a/native/src/defines.h
+++ b/native/src/defines.h
@@ -24,23 +24,88 @@
 #define LOG_TAG "LatinIME: "
 #endif
 #define DEBUG_DICT true
-#define DEBUG_DICT_FULL true
+#define DEBUG_DICT_FULL false
 #define DEBUG_SHOW_FOUND_WORD DEBUG_DICT_FULL
 #define DEBUG_NODE DEBUG_DICT_FULL
 #define DEBUG_TRACE DEBUG_DICT_FULL
+
+// Profiler (the "#else // FLAG_DBG" branch below defines every PROF_* macro as empty)
+#include <time.h>
+#define PROF_BUF_SIZE 100  // slot count; slot PROF_BUF_SIZE - 1 is used by PROF_OPEN/PROF_CLOSE
+static double profile_buf[PROF_BUF_SIZE];  // clock() ticks accumulated per slot by PROF_END
+static double profile_old[PROF_BUF_SIZE];  // clock() value stored by the latest PROF_START
+static unsigned int profile_counter[PROF_BUF_SIZE];  // how often PROF_COUNT ran for each slot
+// Usage: PROF_OPEN ... PROF_START(id) ... PROF_END(id) ... PROF_CLOSE (logs all slots via LOGI).
+#define PROF_RESET               prof_reset()
+#define PROF_COUNT(prof_buf_id)  ++profile_counter[prof_buf_id]
+#define PROF_OPEN                do { PROF_RESET; PROF_START(PROF_BUF_SIZE - 1); } while(0)
+#define PROF_START(prof_buf_id)  do { \
+        PROF_COUNT(prof_buf_id); profile_old[prof_buf_id] = (clock()); } while(0)
+#define PROF_CLOSE               do { PROF_END(PROF_BUF_SIZE - 1); PROF_OUTALL; } while(0)
+#define PROF_END(prof_buf_id)    profile_buf[prof_buf_id] += ((clock()) - profile_old[prof_buf_id])
+#define PROF_CLOCKOUT(prof_buf_id) \
+        LOGI("%s : clock is %f", __FUNCTION__, (clock() - profile_old[prof_buf_id]))
+#define PROF_OUTALL              do { LOGI("--- %s ---", __FUNCTION__); prof_out(); } while(0)
+
+static void prof_reset(void) {  // Zeroes the time, start-stamp and call-count of every slot.
+    for (int slot = PROF_BUF_SIZE - 1; slot >= 0; --slot) {
+        profile_counter[slot] = 0;
+        profile_old[slot] = 0.0;
+        profile_buf[slot] = 0.0;
+    }
+}
+
+static void prof_out(void) {  // Logs the total time and a per-slot breakdown through LOGI.
+    if (profile_counter[PROF_BUF_SIZE - 1] != 1) {  // total slot must have been started once
+        LOGI("Error: You must call PROF_OPEN before PROF_CLOSE.");
+    }
+    LOGI("Total time is %6.3f ms.",
+            profile_buf[PROF_BUF_SIZE - 1] * 1000 / (double)CLOCKS_PER_SEC);
+    double all = 0;  // sum of slots 0 .. PROF_BUF_SIZE - 2; denominator of the percentages
+    for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
+        all += profile_buf[i];
+    }
+    if (all == 0) all = 1;  // nothing was measured: avoid dividing by zero below
+    for (int i = 0; i < PROF_BUF_SIZE - 1; ++i) {
+        if (profile_buf[i] != 0) {  // report only the slots that accumulated time
+            LOGI("(%d): Used %4.2f%%, %8.4f ms. Called %u times.",
+                    i, (profile_buf[i] * 100 / all),  // counter below is unsigned -> %u
+                    profile_buf[i] * 1000 / (double)CLOCKS_PER_SEC, profile_counter[i]);
+        }
+    }
+}
+
 #else // FLAG_DBG
+#define LOGE
 #define LOGI
 #define DEBUG_DICT false
 #define DEBUG_DICT_FULL false
 #define DEBUG_SHOW_FOUND_WORD false
 #define DEBUG_NODE false
 #define DEBUG_TRACE false
+
+#define PROF_BUF_SIZE 0  // profiling compiled out: the PROF_* macros below expand to nothing
+#define PROF_RESET
+#define PROF_COUNT(prof_buf_id)
+#define PROF_OPEN
+#define PROF_START(prof_buf_id)
+#define PROF_CLOSE
+#define PROF_END(prof_buf_id)
+#define PROF_CLOCK_OUT(prof_buf_id)  // NOTE(review): the other branch only defines PROF_CLOCKOUT
+#define PROF_CLOCKOUT(prof_buf_id)
+#define PROF_OUTALL
+
 #endif // FLAG_DBG
 
 #ifndef U_SHORT_MAX
 #define U_SHORT_MAX 1 << 16
 #endif
 
+// Define this to use mmap() for dictionary loading.  Undefine to use malloc() instead of mmap().
+// We measured and compared performance of both, and found mmap() is fairly good in terms of
+// loading time, and acceptable even for several initial lookups which involve page faults.
+#define USE_MMAP_FOR_DICTIONARY
+
 // 22-bit address = ~4MB dictionary size limit, which on average would be about 200k-300k words
 #define ADDRESS_MASK 0x3FFFFF
 
diff --git a/native/src/dictionary.cpp b/native/src/dictionary.cpp
index 8d3290945..fe3375706 100644
--- a/native/src/dictionary.cpp
+++ b/native/src/dictionary.cpp
@@ -23,21 +23,23 @@
 
 namespace latinime {
 
-Dictionary::Dictionary(void *dict, int typedLetterMultiplier, int fullWordMultiplier,
+Dictionary::Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust,
+        int typedLetterMultiplier, int fullWordMultiplier,
         int maxWordLength, int maxWords, int maxAlternatives)
-    : DICT((unsigned char*) dict),
+    : mDict((unsigned char*) dict), mDictSize(dictSize),
+    mMmapFd(mmapFd), mDictBufAdjust(dictBufAdjust),
     // Checks whether it has the latest dictionary or the old dictionary
     IS_LATEST_DICT_VERSION((((unsigned char*) dict)[0] & 0xFF) >= DICTIONARY_VERSION_MIN) {
     if (DEBUG_DICT) {
         if (MAX_WORD_LENGTH_INTERNAL < maxWordLength) {
             LOGI("Max word length (%d) is greater than %d",
                     maxWordLength, MAX_WORD_LENGTH_INTERNAL);
-            LOGI("IN NATIVE SUGGEST Version: %d \n", (DICT[0] & 0xFF));
+            LOGI("IN NATIVE SUGGEST Version: %d", (mDict[0] & 0xFF));
         }
     }
-    mUnigramDictionary = new UnigramDictionary(DICT, typedLetterMultiplier, fullWordMultiplier,
+    mUnigramDictionary = new UnigramDictionary(mDict, typedLetterMultiplier, fullWordMultiplier,
             maxWordLength, maxWords, maxAlternatives, IS_LATEST_DICT_VERSION);
-    mBigramDictionary = new BigramDictionary(DICT, maxWordLength, maxAlternatives,
+    mBigramDictionary = new BigramDictionary(mDict, maxWordLength, maxAlternatives,
             IS_LATEST_DICT_VERSION, hasBigram(), this);
 }
 
@@ -47,7 +49,7 @@ Dictionary::~Dictionary() {
 }
 
 bool Dictionary::hasBigram() {
-    return ((DICT[1] & 0xFF) == 1);
+    return ((mDict[1] & 0xFF) == 1);
 }
 
 // TODO: use uint16_t instead of unsigned short
@@ -64,12 +66,12 @@ int Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int le
     // returns address of bigram data of that word
     // return -99 if not found
 
-    int count = Dictionary::getCount(DICT, &pos);
+    int count = Dictionary::getCount(mDict, &pos);
     unsigned short currentChar = (unsigned short) word[offset];
     for (int j = 0; j < count; j++) {
-        unsigned short c = Dictionary::getChar(DICT, &pos);
-        int terminal = Dictionary::getTerminal(DICT, &pos);
-        int childPos = Dictionary::getAddress(DICT, &pos);
+        unsigned short c = Dictionary::getChar(mDict, &pos);
+        int terminal = Dictionary::getTerminal(mDict, &pos);
+        int childPos = Dictionary::getAddress(mDict, &pos);
         if (c == currentChar) {
             if (offset == length - 1) {
                 if (terminal) {
@@ -85,7 +87,7 @@ int Dictionary::isValidWordRec(int pos, unsigned short *word, int offset, int le
             }
         }
         if (terminal) {
-            Dictionary::getFreq(DICT, IS_LATEST_DICT_VERSION, &pos);
+            Dictionary::getFreq(mDict, IS_LATEST_DICT_VERSION, &pos);
         }
         // There could be two instances of each alphabet - upper and lower case. So continue
         // looking ...
diff --git a/native/src/dictionary.h b/native/src/dictionary.h
index da876242d..cef1cf9eb 100644
--- a/native/src/dictionary.h
+++ b/native/src/dictionary.h
@@ -25,8 +25,8 @@ namespace latinime {
 
 class Dictionary {
 public:
-    Dictionary(void *dict, int typedLetterMultipler, int fullWordMultiplier, int maxWordLength,
-            int maxWords, int maxAlternatives);
+    Dictionary(void *dict, int dictSize, int mmapFd, int dictBufAdjust, int typedLetterMultipler,
+            int fullWordMultiplier, int maxWordLength, int maxWords, int maxAlternatives);
     int getSuggestions(int *codes, int codesSize, unsigned short *outWords, int *frequencies,
             int *nextLetters, int nextLettersSize) {
         return mUnigramDictionary->getSuggestions(codes, codesSize, outWords, frequencies,
@@ -42,8 +42,10 @@ public:
     }
     bool isValidWord(unsigned short *word, int length);
     int isValidWordRec(int pos, unsigned short *word, int offset, int length);
-    void setAsset(void *asset) { mAsset = asset; }
-    void *getAsset() { return mAsset; }
+    void *getDict() { return (void *)mDict; }  // dictionary buffer; the cast drops mDict's const
+    int getDictSize() { return mDictSize; }  // dictSize value handed to the constructor
+    int getMmapFd() { return mMmapFd; }  // mmapFd value handed to the constructor
+    int getDictBufAdjust() { return mDictBufAdjust; }  // dictBufAdjust handed to the constructor
     ~Dictionary();
 
     // public static utility methods
@@ -62,11 +64,17 @@ public:
 private:
     bool hasBigram();
 
-    const unsigned char *DICT;
+    const unsigned char *mDict;
+
+    // Used only for the mmap version of dictionary loading, but we use these as dummy variables
+    // also for the malloc version.
+    const int mDictSize;
+    const int mMmapFd;
+    const int mDictBufAdjust;
+
     const bool IS_LATEST_DICT_VERSION;
-    void *mAsset;
-    BigramDictionary *mBigramDictionary;
     UnigramDictionary *mUnigramDictionary;
+    BigramDictionary *mBigramDictionary;
 };
 
 // ----------------------------------------------------------------------------
diff --git a/native/src/unigram_dictionary.cpp b/native/src/unigram_dictionary.cpp
index f679001cf..3f9bcd758 100644
--- a/native/src/unigram_dictionary.cpp
+++ b/native/src/unigram_dictionary.cpp
@@ -42,14 +42,20 @@ UnigramDictionary::UnigramDictionary(const unsigned char *dict, int typedLetterM
 UnigramDictionary::~UnigramDictionary() {}
 
 int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short *outWords,
-        int *frequencies, int *nextLetters, int nextLettersSize)
-{
+        int *frequencies, int *nextLetters, int nextLettersSize) {
+    PROF_OPEN;
+    PROF_START(0);
     initSuggestions(codes, codesSize, outWords, frequencies);
     if (DEBUG_DICT) assert(codesSize == mInputLength);
 
     const int MAX_DEPTH = min(mInputLength * MAX_DEPTH_MULTIPLIER, MAX_WORD_LENGTH);
+    PROF_END(0);
+
+    PROF_START(1);
     getSuggestionCandidates(-1, -1, -1, nextLetters, nextLettersSize, MAX_DEPTH);
+    PROF_END(1);
 
+    PROF_START(2);
     // Suggestion with missing character
     if (SUGGEST_WORDS_WITH_MISSING_CHARACTER) {
         for (int i = 0; i < codesSize; ++i) {
@@ -57,7 +63,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
             getSuggestionCandidates(i, -1, -1, NULL, 0, MAX_DEPTH);
         }
     }
+    PROF_END(2);
 
+    PROF_START(3);
     // Suggestion with excessive character
     if (SUGGEST_WORDS_WITH_EXCESSIVE_CHARACTER
             && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_EXCESSIVE_CHARACTER_SUGGESTION) {
@@ -66,7 +74,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
             getSuggestionCandidates(-1, i, -1, NULL, 0, MAX_DEPTH);
         }
     }
+    PROF_END(3);
 
+    PROF_START(4);
     // Suggestion with transposed characters
     // Only suggest words that length is mInputLength
     if (SUGGEST_WORDS_WITH_TRANSPOSED_CHARACTERS) {
@@ -75,7 +85,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
             getSuggestionCandidates(-1, -1, i, NULL, 0, mInputLength - 1);
         }
     }
+    PROF_END(4);
 
+    PROF_START(5);
     // Suggestions with missing space
     if (SUGGEST_WORDS_WITH_MISSING_SPACE_CHARACTER
             && mInputLength >= MIN_USER_TYPED_LENGTH_FOR_MISSING_SPACE_SUGGESTION) {
@@ -84,7 +96,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
             getMissingSpaceWords(mInputLength, i);
         }
     }
+    PROF_END(5);
 
+    PROF_START(6);
     // Get the word count
     int suggestedWordsCount = 0;
     while (suggestedWordsCount < MAX_WORDS && mFrequencies[suggestedWordsCount] > 0) {
@@ -99,9 +113,9 @@ int UnigramDictionary::getSuggestions(int *codes, int codesSize, unsigned short
                 LOGI("%c = %d,", k, nextLetters[k]);
             }
         }
-        LOGI("\n");
     }
-
+    PROF_END(6);
+    PROF_CLOSE;
     return suggestedWordsCount;
 }
 
@@ -254,6 +268,14 @@ void UnigramDictionary::getSuggestionCandidates(const int skipPos,
     }
 }
 
+inline static void multiplyRate(const int rate, int *freq) {  // *freq = *freq * rate / 100
+    if (*freq > 1000000) {  // large frequency: divide first so *freq * rate stays within int
+        *freq = (*freq / 100) * rate;
+    } else {  // small frequency: multiply first so the low digits are not lost
+        *freq = *freq * rate / 100;
+    }
+}
+
 bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int missingSpacePos) {
     if (missingSpacePos <= 0 || missingSpacePos >= inputLength
             || inputLength >= MAX_WORD_LENGTH) return false;
@@ -279,7 +301,7 @@ bool UnigramDictionary::getMissingSpaceWords(const int inputLength, const int mi
 
     int pairFreq = ((firstFreq + secondFreq) / 2);
     for (int i = 0; i < inputLength; ++i) pairFreq *= TYPED_LETTER_MULTIPLIER;
-    pairFreq = pairFreq * WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE / 100;
+    multiplyRate(WORDS_WITH_MISSING_SPACE_CHARACTER_DEMOTION_RATE, &pairFreq);
     addWord(word, newWordLength, pairFreq);
     return true;
 }
@@ -330,14 +352,13 @@ inline int UnigramDictionary::calculateFinalFreq(const int inputIndex, const int
         const bool sameLength) {
     // TODO: Demote by edit distance
     int finalFreq = freq * snr;
-    if (skipPos >= 0) finalFreq = finalFreq * WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE / 100;
-    if (transposedPos >= 0) finalFreq = finalFreq
-            * WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE / 100;
+    if (skipPos >= 0) multiplyRate(WORDS_WITH_MISSING_CHARACTER_DEMOTION_RATE, &finalFreq);
+    if (transposedPos >= 0) multiplyRate(
+            WORDS_WITH_TRANSPOSED_CHARACTERS_DEMOTION_RATE, &finalFreq);
     if (excessivePos >= 0) {
-        finalFreq = finalFreq * WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE / 100;
+        multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_DEMOTION_RATE, &finalFreq);
         if (!existsAdjacentProximityChars(inputIndex, mInputLength)) {
-            finalFreq = finalFreq
-                    * WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE / 100;
+            multiplyRate(WORDS_WITH_EXCESSIVE_CHARACTER_OUT_OF_PROXIMITY_DEMOTION_RATE, &finalFreq);
         }
     }
     if (sameLength && skipPos < 0) finalFreq *= FULL_WORD_MULTIPLIER;
diff --git a/native/src/unigram_dictionary.h b/native/src/unigram_dictionary.h
index 445ff7a17..7f7b7bd21 100644
--- a/native/src/unigram_dictionary.h
+++ b/native/src/unigram_dictionary.h
@@ -80,13 +80,13 @@ private:
     bool existsAdjacentProximityChars(const int inputIndex, const int inputLength);
     int* getInputCharsAt(const int index) {return mInputCodes + (index * MAX_PROXIMITY_CHARS);}
     const unsigned char *DICT;
-    const int MAX_WORDS;
     const int MAX_WORD_LENGTH;
+    const int MAX_WORDS;
     const int MAX_PROXIMITY_CHARS;
     const bool IS_LATEST_DICT_VERSION;
-    const int ROOT_POS;
     const int TYPED_LETTER_MULTIPLIER;
     const int FULL_WORD_MULTIPLIER;
+    const int ROOT_POS;
 
     int *mFrequencies;
     unsigned short *mOutputChars;