Skip to content

Commit 630fced

Browse files
authored
Merge pull request #16 from dferreyra/BloomFilter64
Add BloomFilter64 to support large Bloom filters
2 parents edb3da6 + 3e8bcee commit 630fced

File tree

9 files changed

+1058
-0
lines changed

9 files changed

+1058
-0
lines changed
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
using ProbabilisticDataStructures;
7+
using System.Security.Cryptography;
8+
9+
namespace ProbabilisticDataStructures
10+
{
11+
/// <summary>
/// BloomFilter64 implements a classic Bloom filter whose bit array is addressed
/// with 64-bit indices, so it can hold more bits than a single array allows.
/// A Bloom filter has a non-zero probability of false positives and a zero
/// probability of false negatives.
/// </summary>
public class BloomFilter64 : IFilter
{
    /// <summary>
    /// Filter data: one single-bit bucket per filter position.
    /// </summary>
    internal Buckets64 Buckets { get; set; }

    /// <summary>
    /// Hash algorithm used to derive the bit positions for an item.
    /// </summary>
    private HashAlgorithm Hash { get; set; }

    /// <summary>
    /// Filter size (number of bits).
    /// </summary>
    private ulong m { get; set; }

    /// <summary>
    /// Number of hash functions (bit positions probed per item).
    /// </summary>
    private uint k { get; set; }

    /// <summary>
    /// Number of items added since construction or the last <see cref="Reset"/>.
    /// </summary>
    private ulong count { get; set; }

    /// <summary>
    /// Creates a new Bloom filter optimized to store n items with a specified target
    /// false-positive rate. The size and the number of hash functions are taken from
    /// Utils.OptimalM64 and Utils.OptimalK respectively.
    /// </summary>
    /// <param name="n">Number of items to store.</param>
    /// <param name="fpRate">Desired false positive rate.</param>
    public BloomFilter64(ulong n, double fpRate)
    {
        var optimalM = Utils.OptimalM64(n, fpRate);
        var optimalK = Utils.OptimalK(fpRate);

        // Every bucket is a single bit.
        Buckets = new Buckets64(optimalM, 1);
        Hash = Defaults.GetDefaultHashAlgorithm();
        this.m = optimalM;
        this.k = optimalK;
    }

    /// <summary>
    /// Returns the Bloom filter capacity, m.
    /// </summary>
    /// <returns>The Bloom filter capacity, m.</returns>
    public ulong Capacity()
    {
        return this.m;
    }

    /// <summary>
    /// Returns the number of hash functions.
    /// </summary>
    /// <returns>The number of hash functions.</returns>
    public uint K()
    {
        return this.k;
    }

    /// <summary>
    /// Returns the number of items added to the filter. Every call to
    /// <see cref="Add"/> or <see cref="TestAndAdd"/> is counted, including
    /// repeated additions of the same data.
    /// </summary>
    /// <returns>The number of items added.</returns>
    public ulong Count()
    {
        return this.count;
    }

    /// <summary>
    /// Returns the current estimated ratio of set bits, computed from the item
    /// count as 1 - exp(-count * k / m) without inspecting the buckets.
    /// </summary>
    /// <returns>The current estimated ratio of set bits.</returns>
    public double EstimatedFillRatio()
    {
        return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m);
    }

    /// <summary>
    /// Returns the ratio of set bits, measured by reading every bucket.
    /// </summary>
    /// <returns>The ratio of set bits.</returns>
    public double FillRatio()
    {
        ulong sum = 0;
        for (ulong i = 0; i < this.Buckets.count; i++)
        {
            sum += this.Buckets.Get(i);
        }
        return (double)sum / (double)this.m;
    }

    /// <summary>
    /// Will test for membership of the data and returns true if it is a member,
    /// false if not. This is a probabilistic test, meaning there is a non-zero
    /// probability of false positives but a zero probability of false negatives.
    /// </summary>
    /// <param name="data">The data to search for.</param>
    /// <returns>Whether or not the data is maybe contained in the filter.</returns>
    public bool Test(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;

        // If any of the K bits are not set, then it's not a member.
        for (uint i = 0; i < this.k; i++)
        {
            if (this.Buckets.Get((lower + upper * i) % this.m) == 0)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Will add the data to the Bloom filter. It returns the filter to allow
    /// for chaining.
    /// </summary>
    /// <param name="data">The data to add.</param>
    /// <returns>The filter.</returns>
    public IFilter Add(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;

        // Set the K bits.
        for (uint i = 0; i < this.k; i++)
        {
            this.Buckets.Set((lower + upper * i) % this.m, 1);
        }

        this.count++;
        return this;
    }

    /// <summary>
    /// Is equivalent to calling Test followed by Add. It returns true if the data is
    /// a member, false if not. The data is added (and counted) in either case.
    /// </summary>
    /// <param name="data">The data to test for and add if it doesn't exist.</param>
    /// <returns>Whether or not the data was probably contained in the filter.</returns>
    public bool TestAndAdd(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;
        var member = true;

        // If any of the K bits are not set, then it's not a member. All K bits
        // are visited regardless, because each of them must end up set.
        for (uint i = 0; i < this.k; i++)
        {
            var idx = (lower + upper * i) % this.m;
            if (this.Buckets.Get(idx) == 0)
            {
                member = false;
            }
            this.Buckets.Set(idx, 1);
        }

        this.count++;
        return member;
    }

    /// <summary>
    /// Restores the Bloom filter to its original state: all bits are cleared and
    /// the item count returns to zero. The configured hash algorithm is kept.
    /// It returns the filter to allow for chaining.
    /// </summary>
    /// <returns>The reset bloom filter.</returns>
    public BloomFilter64 Reset()
    {
        this.Buckets.Reset();
        // The count must be cleared as well; otherwise Count() and
        // EstimatedFillRatio() keep reporting the pre-reset contents.
        this.count = 0;
        return this;
    }

    /// <summary>
    /// Sets the hashing function used in the filter. Items added with a different
    /// hash algorithm map to different bits, so this is best called before any
    /// data is added.
    /// </summary>
    /// <param name="h">The HashAlgorithm to use.</param>
    // TODO: Add SetHash to the IFilter interface?
    public void SetHash(HashAlgorithm h)
    {
        this.Hash = h;
    }
}
199+
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
7+
namespace ProbabilisticDataStructures
8+
{
9+
/// <summary>
/// Buckets64 is a fast, space-efficient array of buckets where each bucket can store
/// up to a configured maximum value. Buckets are packed bit-by-bit into a set of
/// byte arrays so that the total size can exceed what a single array can hold.
/// </summary>
public class Buckets64
{
    // Size in bytes of each backing array (2^30); the packed data is split into
    // chunks of this size.
    private const uint maxArraySize = 1U << 30;

    // Backing storage: Data[chunk][byteWithinChunk].
    private byte[][] Data { get; set; }

    // Number of chunks in Data.
    private int arrayCount { get; set; }

    // Number of bits per bucket.
    private byte bucketSize { get; set; }

    private byte _max;

    // Largest value a bucket may hold. Assigned values above 255 are capped at 255.
    private int Max
    {
        get
        {
            return _max;
        }
        set
        {
            // TODO: Figure out this truncation thing.
            // I'm not sure if MaxValue is always supposed to be capped at 255 via
            // a byte conversion or not...
            if (value > byte.MaxValue)
            {
                _max = byte.MaxValue;
            }
            else
            {
                _max = (byte)value;
            }
        }
    }

    // Number of buckets.
    internal ulong count { get; set; }

    /// <summary>
    /// Creates a new Buckets64 with the provided number of buckets where each bucket
    /// is the specified number of bits.
    /// </summary>
    /// <param name="count">Number of buckets.</param>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    internal Buckets64(ulong count, byte bucketSize)
    {
        this.count = count;
        this.bucketSize = bucketSize;
        AllocateArray(count, bucketSize);

        // The maximum is capped at 255 anyway, so buckets of 8 or more bits use
        // the cap directly. Computing (1 << bucketSize) - 1 for large sizes is
        // wrong because C# masks an int shift count to its low 5 bits
        // (1 << 32 == 1), which would yield a maximum of 0 for 32-bit buckets.
        this.Max = bucketSize >= 8 ? byte.MaxValue : (1 << bucketSize) - 1;
    }

    /// <summary>
    /// Allocates zero-filled backing storage large enough for the given number of
    /// buckets of the given width, replacing any existing storage.
    /// </summary>
    /// <param name="count">Number of buckets.</param>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    private void AllocateArray(ulong count, byte bucketSize)
    {
        // Total bytes required, rounding the bit count up to a whole byte.
        ulong bytesToAllocate = (count * bucketSize + 7) / 8;

        // The number of chunks is derived from the byte total, not the bucket
        // count, so that buckets wider than 8 bits get all the storage they need
        // and narrow buckets do not produce empty trailing chunks. At least one
        // chunk is always created.
        ulong chunks = (bytesToAllocate + maxArraySize - 1) / maxArraySize;
        this.arrayCount = (int)Math.Max(1UL, chunks);
        this.Data = new byte[this.arrayCount][];

        for (int i = 0; i < this.arrayCount; i++)
        {
            var arraySize = Math.Min(bytesToAllocate, maxArraySize);
            this.Data[i] = new byte[arraySize];
            bytesToAllocate -= arraySize;
        }
    }

    /// <summary>
    /// Returns the maximum value that can be stored in a bucket.
    /// </summary>
    /// <returns>The bucket max value.</returns>
    internal byte MaxBucketValue()
    {
        return this._max;
    }

    /// <summary>
    /// Increment the value in the specified bucket by the provided delta. A bucket
    /// can be decremented by providing a negative delta.
    /// <para>
    /// The value is clamped to zero and the maximum bucket value. Returns itself
    /// to allow for chaining.
    /// </para>
    /// </summary>
    /// <param name="bucket">The bucket to increment.</param>
    /// <param name="delta">The amount to increment the bucket by.</param>
    /// <returns>The modified bucket.</returns>
    internal Buckets64 Increment(ulong bucket, int delta)
    {
        // The bit offset is computed in 64-bit arithmetic; a 32-bit product
        // would silently wrap for large bucket indices.
        ulong offset = bucket * this.bucketSize;
        int val = (int)(GetBits(offset, this.bucketSize) + delta);

        if (val > this.Max)
        {
            val = this.Max;
        }
        else if (val < 0)
        {
            val = 0;
        }

        SetBits(offset, this.bucketSize, (uint)val);
        return this;
    }

    /// <summary>
    /// Set the bucket value. The value is clamped to the maximum bucket value.
    /// Returns itself to allow for chaining.
    /// </summary>
    /// <param name="bucket">The bucket to change the value of.</param>
    /// <param name="value">The value to set.</param>
    /// <returns>The modified bucket.</returns>
    internal Buckets64 Set(ulong bucket, byte value)
    {
        if (value > this._max)
        {
            value = this._max;
        }

        SetBits(bucket * this.bucketSize, this.bucketSize, value);
        return this;
    }

    /// <summary>
    /// Returns the value in the specified bucket.
    /// </summary>
    /// <param name="bucket">The bucket to get.</param>
    /// <returns>The specified bucket.</returns>
    internal uint Get(ulong bucket)
    {
        return GetBits(bucket * this.bucketSize, this.bucketSize);
    }

    /// <summary>
    /// Restores the Buckets64 to the original state by replacing the backing
    /// storage with freshly allocated, zero-filled arrays. Returns itself to allow
    /// for chaining.
    /// </summary>
    /// <returns>The Buckets64 object the reset operation was performed on.</returns>
    internal Buckets64 Reset()
    {
        AllocateArray(this.count, this.bucketSize);
        return this;
    }

    /// <summary>
    /// Returns the bits at the specified offset and length.
    /// </summary>
    /// <param name="offset">The position (in bits) to start reading at.</param>
    /// <param name="length">The distance (in bits) to read from the offset.</param>
    /// <returns>The bits at the specified offset and length.</returns>
    internal uint GetBits(ulong offset, int length)
    {
        ulong byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        if ((byteOffset + length) > 8)
        {
            // The range crosses a byte boundary: read the remainder of this byte
            // as the low bits and recurse for the rest, placed above them.
            int rem = 8 - byteOffset;
            return GetBits(offset, rem)
                | (GetBits(offset + (ulong)rem, length - rem) << rem);
        }

        // The range lies within one byte: locate its chunk and mask it out.
        var dataArray = this.Data[byteIndex / maxArraySize];
        var dataArrayByteIndex = byteIndex % maxArraySize;
        int bitMask = (1 << length) - 1;
        return (uint)((dataArray[dataArrayByteIndex] & (bitMask << byteOffset)) >> byteOffset);
    }

    /// <summary>
    /// Sets bits at the specified offset and length.
    /// </summary>
    /// <param name="offset">The position (in bits) to start writing at.</param>
    /// <param name="length">The distance (in bits) to write from the offset.</param>
    /// <param name="bits">The bits to write; only the low <paramref name="length"/> bits are used.</param>
    internal void SetBits(ulong offset, int length, uint bits)
    {
        ulong byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        if ((byteOffset + length) > 8)
        {
            // The range crosses a byte boundary: write the low bits into the
            // remainder of this byte, then recurse with the remaining high bits.
            int rem = 8 - byteOffset;
            SetBits(offset, (byte)rem, bits);
            SetBits(offset + (ulong)rem, length - rem, bits >> rem);
            return;
        }

        var dataArray = this.Data[(uint)(byteIndex / maxArraySize)];
        var dataArrayByteIndex = (uint)(byteIndex % maxArraySize);
        int bitMask = (1 << length) - 1;

        // Clear the target bits, then OR in the new value.
        dataArray[dataArrayByteIndex] =
            (byte)((dataArray[dataArrayByteIndex]) & ~(bitMask << byteOffset));
        dataArray[dataArrayByteIndex] =
            (byte)((dataArray[dataArrayByteIndex]) | ((bits & bitMask) << byteOffset));
    }
}
189+
}

0 commit comments

Comments
 (0)