|
| 1 | +using BitIntegers |
| 2 | + |
| 3 | +using Base: count_ones |
| 4 | +import Base: in!, iterate, IteratorSize |
| 5 | + |
"""
    _hash_bloom(x, seeds, T)

Build a bit mask of type `T` for `x`. For every seed in `seeds`, `hash(x, seed)` is
reduced modulo the bit width of `T` and the bit at that position is set. The result is
`zero(T)` when `seeds` is empty.
"""
function _hash_bloom(x, seeds, T)
    nbits = 8 * sizeof(T)
    return foldl(seeds; init=zero(T)) do mask, seed
        mask | (one(T) << (hash(x, seed) % nbits))
    end
end
| 13 | + |
"""
    BloomFilter{T,K}(p, seeds)

Bloom filter whose bit array is a single integer of type `T`, carrying `K` hash seeds.

# Fields
- `memory::T`: the bit array; initialised to `zero(T)` and the only reassignable field.
- `seeds::NTuple{K,UInt64}`: the `K` hash seeds.
- `p::Float64`: maximum tolerable false positive rate.

The inner constructor only accepts `T <: Unsigned`.
"""
mutable struct BloomFilter{T,K}
    memory::T
    const seeds::NTuple{K,UInt64}
    const p::Float64  # max tolerable false positive rate

    BloomFilter{T,K}(p, seeds) where {T<:Unsigned,K} = new{T,K}(zero(T), seeds, p)
end
| 22 | + |
"""
    BloomFilter(; k=5, p=1/16)

Create a `BloomFilter{UInt64,k}` with `k` randomly drawn hash seeds and maximum
tolerable false positive rate `p`.
"""
function BloomFilter(; k=5, p=1/16)
    seeds = ntuple(_ -> rand(UInt), k)
    return BloomFilter{UInt64,k}(p, seeds)
end
| 26 | + |
"""
    in!(x, f::BloomFilter)

Insert `x` into `f` and report whether every bit of its mask was already set before the
insertion. `false` means `x` had definitely not been inserted before; `true` may be a
false positive.
"""
function in!(x, f::BloomFilter{T}) where {T}
    mask = _hash_bloom(x, f.seeds, T)
    before = f.memory
    f.memory = before | mask
    # Membership is judged against the state prior to this insertion.
    return (before & mask) == mask
end
| 33 | + |
"""
    false_positive_rate(f::BloomFilter)

Estimate the false positive rate of `f` as the fraction of set bits in `f.memory`
raised to the power `K`, the number of hash seeds.
"""
function false_positive_rate(f::BloomFilter{T,K}) where {T,K}
    occupancy = count_ones(f.memory) / (8 * sizeof(T))
    return occupancy^K
end
| 39 | + |
"""
    Unique(f, g)

Iterator wrapper over `g` that uses `f` to record the elements it has produced.

# Fields
- `f::Set`: the elements recorded so far; mutated while iterating.
- `g`: the wrapped iterable.
"""
struct Unique
    f::Set
    g::Any
end
| 44 | + |
"""
    iterate(u::Unique)

Start iterating `u`. Returns `nothing` when the wrapped iterable `u.g` is empty;
otherwise records the first element in `u.f` and returns it together with the next
state of `u.g`.
"""
function iterate(u::Unique)
    it = iterate(u.g)
    # An empty iterable yields `nothing`, which cannot be destructured.
    it === nothing && return nothing
    x, next = it
    push!(u.f, x)
    return (x, next)
end
| 50 | + |
"""
    iterate(u::Unique, state)

Advance `u.g` from `state` until an element is found for which `in!(x, u.f)` reports
`false` (not seen before), and return that element with the following state. Returns
`nothing` once `u.g` is exhausted.
"""
function iterate(u::Unique, state)
    it = iterate(u.g, state)
    # Skip duplicates with a loop rather than recursion: Julia does not guarantee
    # tail-call elimination, so a long run of duplicates could overflow the stack.
    while it !== nothing
        x, next = it
        in!(x, u.f) || return (x, next)  # definitely unique
        it = iterate(u.g, next)
    end
    return nothing
end
| 65 | + |
# The iteration trait is queried on the iterator *type* (instances fall back to
# `IteratorSize(typeof(x))`), so define it there. The number of distinct elements is
# not known in advance.
IteratorSize(::Type{Unique}) = Base.SizeUnknown()
0 commit comments