aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKeisuke Kuroyanagi <ksk@google.com>2014-04-09 16:18:23 +0900
committerKeisuke Kuroyanagi <ksk@google.com>2014-04-09 16:18:23 +0900
commit2fd0bf9a37b510e1a79dd8c43ed6f6d5919dc376 (patch)
treee467bc2dc6ce6a46905bf169e60004852364e4f7
parent113523d22db3b183203abe475baebbc6ad0c486c (diff)
downloadlatinime-2fd0bf9a37b510e1a79dd8c43ed6f6d5919dc376.tar.gz
latinime-2fd0bf9a37b510e1a79dd8c43ed6f6d5919dc376.tar.xz
latinime-2fd0bf9a37b510e1a79dd8c43ed6f6d5919dc376.zip
Use bitset for BloomFilter.
Before: (0) 660.00 (1.43%) (1) 45320.00 (98.18%) (2) 80.00 (0.17%) Total 46160.00 (sum of others 46060.00) After: (0) 620.00 (1.34%) (1) 45310.00 (98.05%) (2) 130.00 (0.28%) Total 46210.00 (sum of others 46060.00) Change-Id: I936b639c50e15208aee999a929b33983c6caa59d
-rw-r--r--native/jni/NativeFileList.mk1
-rw-r--r--native/jni/src/suggest/core/dictionary/bloom_filter.cpp25
-rw-r--r--native/jni/src/suggest/core/dictionary/bloom_filter.h39
3 files changed, 17 insertions, 48 deletions
diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk
index 3da9d5632..23b037701 100644
--- a/native/jni/NativeFileList.mk
+++ b/native/jni/NativeFileList.mk
@@ -27,7 +27,6 @@ LATIN_IME_CORE_SRC_FILES := \
dic_nodes_cache.cpp) \
$(addprefix suggest/core/dictionary/, \
bigram_dictionary.cpp \
- bloom_filter.cpp \
dictionary.cpp \
digraph_utils.cpp \
error_type_utils.cpp \
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp b/native/jni/src/suggest/core/dictionary/bloom_filter.cpp
deleted file mode 100644
index 4ae474e0c..000000000
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/core/dictionary/bloom_filter.h"
-
-namespace latinime {
-
-// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
-// prime under 128 * 8.
-const int BloomFilter::BIGRAM_FILTER_MODULO = 1021;
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h
index 85b8fc397..1e60f49ed 100644
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.h
+++ b/native/jni/src/suggest/core/dictionary/bloom_filter.h
@@ -17,8 +17,7 @@
#ifndef LATINIME_BLOOM_FILTER_H
#define LATINIME_BLOOM_FILTER_H
-#include <cstdint>
-#include <cstring>
+#include <bitset>
#include "defines.h"
@@ -34,41 +33,37 @@ namespace latinime {
// Total 148603.14 (sum of others 148579.90)
class BloomFilter {
public:
- BloomFilter() {
- ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
- memset(mFilter, 0, sizeof(mFilter));
- }
+ BloomFilter() : mFilter() {}
- // TODO: uint32_t position
- AK_FORCE_INLINE void setInFilter(const int32_t position) {
- const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
- mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
+ AK_FORCE_INLINE void setInFilter(const int position) {
+ mFilter.set(getIndex(position));
}
- // TODO: uint32_t position
- AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
- const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
- return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
+ AK_FORCE_INLINE bool isInFilter(const int position) const {
+ return mFilter.test(getIndex(position));
}
private:
DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
- // Size, in bytes, of the bloom filter index for bigrams
- // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
+ AK_FORCE_INLINE size_t getIndex(const int position) const {
+ return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
+ }
+
+ // Size, in bits, of the bloom filter index for bigrams
+ // The probability of false positive is (1 - e ** (-kn/m))**k,
// where k is the number of hash functions, n the number of bigrams, and m the number of
// bits we can test.
- // At the moment 100 is the maximum number of bigrams for a word with the current
+ // At the moment 100 is the maximum number of bigrams for a word with the current main
// dictionaries, so n = 100. 1024 buckets give us m = 1024.
// With 1 hash function, our false positive rate is about 9.3%, which should be enough for
// our uses since we are only using this to increase average performance. For the record,
// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
// and m = 4096 gives 2.4%.
- // This is assigned here because it is used for array size.
- static const int BIGRAM_FILTER_BYTE_SIZE = 128;
- static const int BIGRAM_FILTER_MODULO;
-
- uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
+ // This is assigned here because it is used for bitset size.
+ // 1021 is the largest prime under 1024.
+ static const size_t BIGRAM_FILTER_MODULO = 1021;
+ std::bitset<BIGRAM_FILTER_MODULO> mFilter;
};
} // namespace latinime
#endif // LATINIME_BLOOM_FILTER_H