aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKeisuke Kuroyanagi <ksk@google.com>2014-04-09 07:39:49 +0000
committerAndroid (Google) Code Review <android-gerrit@google.com>2014-04-09 07:39:49 +0000
commite3d57ae792779a6e5588cb97885970cb1adef312 (patch)
treefc42b5408b8d2f18a5822db11b66076bb4053443
parentbbefa8c8265820f5f9aeb0e2a49a4526d7cdd682 (diff)
parent2fd0bf9a37b510e1a79dd8c43ed6f6d5919dc376 (diff)
downloadlatinime-e3d57ae792779a6e5588cb97885970cb1adef312.tar.gz
latinime-e3d57ae792779a6e5588cb97885970cb1adef312.tar.xz
latinime-e3d57ae792779a6e5588cb97885970cb1adef312.zip
Merge "Use bitset for BloomFilter."
-rw-r--r--native/jni/NativeFileList.mk1
-rw-r--r--native/jni/src/suggest/core/dictionary/bloom_filter.cpp25
-rw-r--r--native/jni/src/suggest/core/dictionary/bloom_filter.h39
3 files changed, 17 insertions, 48 deletions
diff --git a/native/jni/NativeFileList.mk b/native/jni/NativeFileList.mk
index ce3975265..a22497fd9 100644
--- a/native/jni/NativeFileList.mk
+++ b/native/jni/NativeFileList.mk
@@ -27,7 +27,6 @@ LATIN_IME_CORE_SRC_FILES := \
dic_nodes_cache.cpp) \
$(addprefix suggest/core/dictionary/, \
bigram_dictionary.cpp \
- bloom_filter.cpp \
dictionary.cpp \
digraph_utils.cpp \
error_type_utils.cpp \
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp b/native/jni/src/suggest/core/dictionary/bloom_filter.cpp
deleted file mode 100644
index 4ae474e0c..000000000
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Copyright (C) 2013, The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "suggest/core/dictionary/bloom_filter.h"
-
-namespace latinime {
-
-// Must be smaller than BIGRAM_FILTER_BYTE_SIZE * 8, and preferably prime. 1021 is the largest
-// prime under 128 * 8.
-const int BloomFilter::BIGRAM_FILTER_MODULO = 1021;
-
-} // namespace latinime
diff --git a/native/jni/src/suggest/core/dictionary/bloom_filter.h b/native/jni/src/suggest/core/dictionary/bloom_filter.h
index 85b8fc397..1e60f49ed 100644
--- a/native/jni/src/suggest/core/dictionary/bloom_filter.h
+++ b/native/jni/src/suggest/core/dictionary/bloom_filter.h
@@ -17,8 +17,7 @@
#ifndef LATINIME_BLOOM_FILTER_H
#define LATINIME_BLOOM_FILTER_H
-#include <cstdint>
-#include <cstring>
+#include <bitset>
#include "defines.h"
@@ -34,41 +33,37 @@ namespace latinime {
// Total 148603.14 (sum of others 148579.90)
class BloomFilter {
public:
- BloomFilter() {
- ASSERT(BIGRAM_FILTER_BYTE_SIZE * 8 >= BIGRAM_FILTER_MODULO);
- memset(mFilter, 0, sizeof(mFilter));
- }
+ BloomFilter() : mFilter() {}
- // TODO: uint32_t position
- AK_FORCE_INLINE void setInFilter(const int32_t position) {
- const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
- mFilter[bucket >> 3] |= static_cast<uint8_t>(1 << (bucket & 0x7));
+ AK_FORCE_INLINE void setInFilter(const int position) {
+ mFilter.set(getIndex(position));
}
- // TODO: uint32_t position
- AK_FORCE_INLINE bool isInFilter(const int32_t position) const {
- const uint32_t bucket = static_cast<uint32_t>(position % BIGRAM_FILTER_MODULO);
- return (mFilter[bucket >> 3] & static_cast<uint8_t>(1 << (bucket & 0x7))) != 0;
+ AK_FORCE_INLINE bool isInFilter(const int position) const {
+ return mFilter.test(getIndex(position));
}
private:
DISALLOW_ASSIGNMENT_OPERATOR(BloomFilter);
- // Size, in bytes, of the bloom filter index for bigrams
- // 128 gives us 1024 buckets. The probability of false positive is (1 - e ** (-kn/m))**k,
+ AK_FORCE_INLINE size_t getIndex(const int position) const {
+ return static_cast<size_t>(position) % BIGRAM_FILTER_MODULO;
+ }
+
+ // Size, in bits, of the bloom filter index for bigrams
+ // The probability of false positive is (1 - e ** (-kn/m))**k,
// where k is the number of hash functions, n the number of bigrams, and m the number of
// bits we can test.
- // At the moment 100 is the maximum number of bigrams for a word with the current
+ // At the moment 100 is the maximum number of bigrams for a word with the current main
// dictionaries, so n = 100. 1024 buckets give us m = 1024.
// With 1 hash function, our false positive rate is about 9.3%, which should be enough for
// our uses since we are only using this to increase average performance. For the record,
// k = 2 gives 3.1% and k = 3 gives 1.6%. With k = 1, making m = 2048 gives 4.8%,
// and m = 4096 gives 2.4%.
- // This is assigned here because it is used for array size.
- static const int BIGRAM_FILTER_BYTE_SIZE = 128;
- static const int BIGRAM_FILTER_MODULO;
-
- uint8_t mFilter[BIGRAM_FILTER_BYTE_SIZE];
+ // This is assigned here because it is used for bitset size.
+ // 1021 is the largest prime under 1024.
+ static const size_t BIGRAM_FILTER_MODULO = 1021;
+ std::bitset<BIGRAM_FILTER_MODULO> mFilter;
};
} // namespace latinime
#endif // LATINIME_BLOOM_FILTER_H