Honeycomb  0.1
Component-Model Framework
BloomFilter.h
Go to the documentation of this file.
1 // Honeycomb, Copyright (C) 2015 NewGamePlus Inc. Distributed under the Boost Software License v1.0.
2 #pragma once
3 
4 #include "Honey/Misc/BitSet.h"
5 #include "Honey/String/Id.h"
6 #include "Honey/Math/Alge/Alge.h"
7 
8 namespace honey
9 {
10 
12 namespace bloom_filter
13 {
15  namespace priv
16  {
17  extern const szt seedCount;
18  extern const szt seeds[];
19  }
22  template<class T>
24  struct hash
25  {
26  szt operator()(const T& val, szt hashIndex) const { return honey::hash::fast(ByteBufConst(reinterpret_cast<const byte*>(&val), sizeof(T)), priv::seeds[hashIndex]); }
27  };
28 
30  inline tuple<szt,szt> calcParams(szt elemCount, Double errorProb)
31  {
32  // Optimal hash count (k) formula that minimizes error (p):
33  // m = -n*ln(p) / ln(2)^2
34  szt bitCount = Alge_d::ceil(-Double(elemCount) * Alge_d::log(errorProb) / Alge_d::sqr(Alge_d::log(2)));
35  // k = ln(2) * m / n
36  szt hashCount = Alge_d::ceil(Alge_d::log(2) * bitCount / elemCount);
37  return make_tuple(bitCount, hashCount);
38  }
39 
41  template<class T, class Alloc = std::allocator<szt>>
42  struct Key
43  {
44  Key() {}
46  Key(szt elemCount, Double errorProb = 0.01, const Alloc& a = Alloc()) :
47  hashes(get<1>(calcParams(elemCount, errorProb)), 0, a) {}
48 
49  bool operator==(const Key& rhs) const { return hashes == rhs.hashes; }
50 
52  void hash(const T& obj) { for (szt i = 0; i < hashes.size(); ++i) hashes[i] = hasher(obj,i); }
53 
54  vector<szt,Alloc> hashes;
56  };
57 
59  template<class T, class Alloc>
61  struct hash<Key<T,Alloc>>
62  {
63  szt operator()(const Key<T,Alloc>& val, szt hashIndex) const
64  { assert(hashIndex < val.hashes.size(), "Key does not have enough hashes for bloom filter"); return val.hashes[hashIndex]; }
65  };
67 }
68 
70 
75 template<class T, class Block = uint64, class Alloc = std::allocator<Block>>
77 {
78 public:
80 
86  BloomFilter(szt elemCount, Double errorProb = 0.01, const Alloc& a = Alloc()) :
87  _errorProb(errorProb),
88  _bits(0, false, a)
89  {
90  szt bitCount;
91  tie(bitCount, _hashCount) = bloom_filter::calcParams(elemCount, errorProb);
92  assert(_hashCount <= bloom_filter::priv::seedCount, "Not enough seeds, either try a higher error prob or add more seeds");
93  // Round up to nearest power of two so that hash can be converted to index without a modulo
94  _bits.resize(BitOp::pow2Ceil(bitCount));
95  _bitIndexMask = _bits.size()-1;
96  }
97 
99  void insert(const T& obj)
100  {
101  for (szt i = 0; i < _hashCount; ++i)
102  _bits.set(bitIndex(_hasher(obj, i)));
103  }
104 
106  bool contains(const T& obj) const
107  {
108  for (szt i = 0; i < _hashCount; ++i)
109  if (!_bits.test(bitIndex(_hasher(obj, i)))) return false;
110  return true;
111  }
112 
114  void clear() { _bits.reset(); }
115 
117  Double errorProb() const { return _errorProb; }
119  const BitSet& bits() const { return _bits; }
120 
121 private:
123  szt bitIndex(szt hash) const { return hash & _bitIndexMask; }
124 
125  Double _errorProb;
126  BitSet _bits;
127  szt _bitIndexMask;
128  szt _hashCount;
129  bloom_filter::hash<T> _hasher;
130 };
131 
132 }
133 
134 namespace std
135 {
137  template<class T, class Alloc>
138  struct hash<honey::bloom_filter::Key<T,Alloc>>
139  {
141  {
142  using namespace honey;
143  assert(val.hashes.size());
144  return val.hashes[0];
145  }
146  };
147 }
szt fast(ByteBufConst bs, szt seed)
Quickly generate a small hash value. Each seed value produces a unique hash from the same data...
Definition: Hash.cpp:99
Double_::Real Double
double type
Definition: Double.h:60
Key()
Definition: BloomFilter.h:44
A space-efficient probabilistic data structure that is used to test set membership. Can be tuned to use less space at the expense of increased false positive probabilty.
Definition: BloomFilter.h:76
Buffer< const byte > ByteBufConst
Definition: Bytes.h:21
szt size() const
Number of bits in the bit array.
Definition: BitSet.h:103
STL namespace.
void reset(szt index)
Set bit to false.
Definition: BitSet.h:76
BitSet_< Block, Alloc > BitSet
Definition: BloomFilter.h:79
static Real ceil(Real x)
Round up to the nearest whole number towards +inf.
Definition: Alge.h:33
Functor used to generate hash. Each hashIndex for the same object must produce a unique hash...
Definition: BloomFilter.h:24
Caches multiple hashes of an object. The key can be inserted into a bloom filter and tested very quic...
Definition: BloomFilter.h:42
void clear()
Remove all elements from the set.
Definition: BloomFilter.h:114
void set(szt index)
Set bit to true.
Definition: BitSet.h:70
BloomFilter(szt elemCount, Double errorProb=0.01, const Alloc &a=Alloc())
Definition: BloomFilter.h:86
tuple< szt, szt > calcParams(szt elemCount, Double errorProb)
Calculate optimal bloom parameters – bit and hash count.
Definition: BloomFilter.h:30
vector< szt, Alloc > hashes
Definition: BloomFilter.h:54
#define assert(...)
Forwards to assert_#args. See assert_1(), assert_2().
Definition: Debug.h:24
bool contains(const T &obj) const
Check if element is in the set, may return false positive.
Definition: BloomFilter.h:106
void resize(szt size, bool val=false)
Resize array to contain size number of bits, with new bits initialized to val
Definition: BitSet.h:38
bloom_filter::hash< T > hasher
Definition: BloomFilter.h:55
void hash(const T &obj)
Generate and cache all the hashes for the object.
Definition: BloomFilter.h:52
size_t szt
Size type, shorthand for size_t.
Definition: Core.h:90
Key(szt elemCount, Double errorProb=0.01, const Alloc &a=Alloc())
Same params as bloom filter, key will cache the required number of hashes.
Definition: BloomFilter.h:46
bool operator==(const Key &rhs) const
Definition: BloomFilter.h:49
static Num sqr(Num x)
Square.
Definition: Alge.h:60
void insert(const T &obj)
Insert element into set.
Definition: BloomFilter.h:99
size_t operator()(const honey::bloom_filter::Key< T, Alloc > &val) const
Definition: BloomFilter.h:140
Double errorProb() const
Get false positive probability.
Definition: BloomFilter.h:117
bool test(szt index) const
Get bit value.
Definition: BitSet.h:84
Global Honeycomb namespace.
szt operator()(const T &val, szt hashIndex) const
Definition: BloomFilter.h:26
static Real log(Real x)
Natural logarithm. ie. ln(x)
Definition: Alge.h:80
const BitSet & bits() const
Get underlying bit array.
Definition: BloomFilter.h:119