@@ -3,48 +3,109 @@ using BitIntegers
33using Base: count_ones
44import Base: in!, iterate, IteratorSize
55
6- function _hash_bloom (x, seeds, T)
6+
7+
"""
    BloomFilterNode{T,K,L}

One link of a Bloom-filter chain.

# Fields
- `memory::T`: bit set of this node; a new node starts at `zero(T)`.
- `seeds::NTuple{K,UInt64}`: the `K` hash seeds used by this node.
- `next::L`: the following node, or `nothing` (the default) for the last node.
  This field is constant.
"""
mutable struct BloomFilterNode{T,K,L}
    memory::T
    seeds::NTuple{K,UInt64}
    const next::L

    # `T` and `K` are supplied by the caller; `L` is taken from the type of `next`.
    function BloomFilterNode{T,K}(seeds, next=nothing) where {T<:Unsigned,K}
        N = typeof(next)
        return new{T,K,N}(zero(T), seeds, next)
    end
end
16+
"""
    BloomFilterChain{T,K,L}

Handle on a chain of Bloom-filter nodes. `head` (of type `L`) is the first node of
the chain; `T` and `K` are carried as type parameters only.
"""
struct BloomFilterChain{T,K,L}
    head::L

    # `T` and `K` are supplied by the caller; `L` is taken from the type of `head`.
    function BloomFilterChain{T,K}(head) where {T<:Unsigned,K}
        H = typeof(head)
        return new{T,K,H}(head)
    end
end
23+
"""
    BloomFilterChain{T,K}()

Create a chain made of a single empty node whose `K` seeds are drawn with
`rand(UInt64)`.
"""
function BloomFilterChain{T,K}() where {T,K}
    root = BloomFilterNode{T,K}(ntuple(_ -> rand(UInt64), K))
    return BloomFilterChain{T,K}(root)
end
29+
"""
    grow_chain(f::BloomFilterChain{T,K})

Return a new chain whose head is a fresh, empty node with `K` newly drawn random
seeds and whose `next` is the head of `f`. `f` itself is not modified.
"""
function grow_chain(f::BloomFilterChain{T,K}) where {T,K}
    # The new node keeps the word type `T`; widening it (`widen(T)`) is disabled.
    fresh = BloomFilterNode{T,K}(ntuple(_ -> rand(UInt64), K), f.head)
    return BloomFilterChain{T,K}(fresh)
end
"""
    into(f::BloomFilterChain{T,K}, x; p=0.05)

Return a new chain whose head is a fresh, empty node with `K` newly drawn random
seeds and whose `next` is the head of `f`.

NOTE(review): `x` and `p` are accepted but not used by the body, which currently
does the same work as `grow_chain(f)` — confirm the intended behaviour.
"""
function into(f::BloomFilterChain{T,K}, x; p=0.05) where {T,K}
    # The new node keeps the word type `T`; widening it (`widen(T)`) is disabled.
    fresh_seeds = ntuple(_ -> rand(UInt64), K)
    fresh = BloomFilterNode{T,K}(fresh_seeds, f.head)
    return BloomFilterChain{T,K}(fresh)
end
45+
46+
"""
    _hash_bloom(x, seeds, ::Type{T}) where T

Build the Bloom-filter bit mask of `x` in a word of type `T`.

For every seed in `seeds`, `x` is hashed with that seed and the hash selects one bit
position in `0:(8sizeof(T) - 1)`. The result is the OR of all selected bits, so it has
at most `length(seeds)` bits set. With no seeds the result is `zero(T)`.
"""
function _hash_bloom(x, seeds, ::Type{T}) where T
    nbits = 8 * sizeof(T)
    # `h & (nbits - 1)` equals `h % nbits` only when `nbits` is a power of two. For any
    # other word width fall back to `%`, so that every bit position stays reachable.
    pow2 = ispow2(nbits)
    mask = zero(T)
    for seed in seeds
        # `hash` takes its seed as a `UInt`; `% UInt` leaves `UInt` seeds unchanged.
        hashed = hash(x, seed % UInt)
        pos = pow2 ? (hashed & (nbits - 1)) : (hashed % nbits)
        mask |= one(T) << pos
    end
    return mask
end
1355
14- mutable struct BloomFilter{T,K}
15- memory:: T
16- const seeds:: NTuple{K, UInt64}
17- const p:: Float64 # max tolerable false positive rate
18- function BloomFilter {T,K} (p, seeds) where {T<: Unsigned ,K}
19- new {T,K} (zero (T), seeds, p)
20- end
# Membership test on the last node of a chain (`next` is `nothing`): `x` is reported
# as present when every bit of its mask is already set in the node's memory.
function _in(x, f::BloomFilterNode{T,K,Nothing}) where {T,K}
    mask = _hash_bloom(x, f.seeds, T)
    hit = f.memory & mask
    return hit == mask
end
2260
23- function BloomFilter (; k= 5 , p= 1 / 16 )
24- return BloomFilter {UInt64, k} (p, ntuple (_ -> rand (UInt), k))
# Membership test on a node that has a successor: `x` is reported as present when this
# node already holds all bits of its mask, otherwise the rest of the chain is asked.
function _in(x, f::BloomFilterNode{T}) where {T}
    mask = _hash_bloom(x, f.seeds, T)
    if (f.memory & mask) == mask
        return true
    end
    return _in(x, f.next)
end
2665
27- function in ! (x, f:: BloomFilter{T } ) where {T}
# Insert `x` into the last node of a chain (`next` is `nothing`) and return whether
# all bits of its mask were already set before the insertion.
function _in!(x, f::BloomFilterNode{T,K,Nothing}) where {T,K}
    mask = _hash_bloom(x, f.seeds, T)
    before = f.memory
    f.memory = before | mask
    return (before & mask) == mask
end
3372
34- function false_positive_rate (f:: BloomFilter{T,K} ) where {T,K}
# Insert `x` into a node that has a successor. Only this node's memory is updated; the
# rest of the chain is queried (read-only) when this node did not already hold `x`.
function _in!(x, f::BloomFilterNode{T}) where {T}
    mask = _hash_bloom(x, f.seeds, T)
    seen = (f.memory & mask) == mask
    f.memory |= mask
    seen && return true
    return _in(x, f.next)
end
79+
# Insert `x` into the chain through its head node; returns the result of `_in!`, i.e.
# whether `x` was already reported as present.
function in!(x, f::BloomFilterChain{T}) where {T}
    return _in!(x, f.head)
end
81+
# Base case of the recursion over a chain: past the last node (`nothing`) the rate is
# zero. Returned as a `Float64` so that it has the same type as the rate computed for a
# node, which is a floating-point value.
_false_positive_rate(::Nothing, k) = 0.0
# Estimated false-positive rate of the chain starting at node `f`, for `k` hash seeds.
# The rate of this node alone is (fraction of set bits)^k; it is combined with the rate
# of the remaining chain as the probability of "this node or the rest", p1 + p2 - p1*p2.
function _false_positive_rate(f::BloomFilterNode{T}, k) where {T}
    load = count_ones(f.memory) / (8 * sizeof(T))
    here = load^k
    rest = _false_positive_rate(f.next, k)
    return here + rest - here * rest
end
3991
40- struct Unique
41- f:: Set
42- g
# Estimated false-positive rate of a whole chain: starts the node recursion at the head,
# using the chain's type parameter `K` as the number of hash seeds.
function _false_positive_rate(f::BloomFilterChain{T,K}) where {T,K}
    return _false_positive_rate(f.head, K)
end
93+
94+
95+
96+
97+
98+
99+
# Wrap the iterable `it` in a `Unique`, paired with a fresh `BloomFilterChain` that
# uses `UInt128` words and 3 hash seeds.
function myunique(it)
    filter = BloomFilterChain{UInt128,3}()
    return Unique{typeof(it)}(filter, it)
end
"""
    Unique{T}

Pairs a wrapped iterable with a Bloom-filter chain.

# Fields
- `f`: the `BloomFilterChain{UInt128,3}` that elements are inserted into.
- `g::T`: the wrapped iterable.

NOTE(review): `BloomFilterChain` has three type parameters, so the declared type of
`f` leaves the last one free and the field is not concretely typed — confirm whether
that is intended.
"""
struct Unique{T}
    f::BloomFilterChain{UInt128,3}
    g::T
end
44105
"""
    iterate(u::Unique)

Start iterating `u`. Returns `nothing` when the wrapped iterable `u.g` is empty;
otherwise inserts the first element into the filter `u.f` and returns
`(element, state)`, where `state` is the state of the wrapped iterable.
"""
function iterate(u::Unique)
    step = iterate(u.g)
    # An empty iterable yields `nothing`; propagate it, since destructuring it would
    # throw.
    step === nothing && return nothing
    x, state = step
    # Record the element in the filter; the membership result is not needed here.
    in!(x, u.f)
    return (x, state)
end
50111
0 commit comments