// Definition and use of a simple Bloom filter for text. Example use:
//
// $ ./bloomf dict < text
//
// The words in the dictionary are read into a Bloom filter. Any words from
// std-in that can't be found in the Bloom filter are written to std-out.
// Command-line options are:
//
// -b n - use n buckets in the Bloom filter; default n = 800,000.
// -d d - use a d-degree Bloom filter; default d = 1.
//
// CS 306 - Computer Algorithms II, Fall 2007
#include <string>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cassert>
#include <vector>
#include "cl-options.h"
class bit_vector
public:
// Create a new bit vector of the given number of bits; all bits are
// set to 0.
explicit bit_vector(unsigned size)
: bytes(new unsigned char [size/8 + ((size % 8) ? 1 : 0)]),
max(size)
memset(bytes, 0, size/8)
// Delete this bit vector.
~bit_vector()
delete [] bytes
// Return true iff the bit at the given index is 1; die if the bit index
// isn't valid.
bool get(unsigned i) const
return bref(i) & bshift(i)
// Set the bit at the given index to 1; die if the bit index isn't
// valid.
void set(unsigned i)
bref(i) |= bshift(i)
// Return the number of indexable bits in this vector.
unsigned size() const
return max
private:
unsigned char * bytes
unsigned max
inline unsigned char & bref(unsigned i) const
assert(i < max)
return bytes[i/8]
inline unsigned bshift(unsigned i) const
return 1 << (i % 8)
class bloom_filter
public:
// Create a new Bloom filter with the given number of bits and the given
// degree; the new filter is empty.
explicit bloom_filter(unsigned size, unsigned d = 1)
: p(next_prime(size)), bits(p), filled(0), degree(d)
assert((0 < d) && (d <= prime_count))
// Add the given word to this Bloom filter.
void add(const std::string & word)
for unsigned d = 1; d <= degree; d++
const unsigned h = hash(d, word)
if not bits.get(h)
filled++
bits.set(h)
// Return true iff the given word is in this Bloom filter. Collisions may
// cause false positives.
bool find(const std::string & word) const
bool found = true
for unsigned d = 1; found and (d <= degree); d++
found = found and bits.get(hash(d, word))
return found
// Retrn this filter's load factor.
double load_factor() const
return static_cast<double>(filled)/bits.size()
private:
// The smallest prime larger then the requsted filter size.
unsigned p
// The bloom filter.
bit_vector bits
// The number of full buckets.
unsigned filled
// The number of times a word should be hashed into the filter.
unsigned degree
unsigned h(unsigned d, const std::string & word) const
// Hash, using the given degree, the given string into an unsigned int.
unsigned h = 0
const unsigned p = primes[d - 1]
for unsigned i = 0; i < word.size(); i++
h = p*h + word[i]
return h
// An array of primes for use in h().
static const unsigned primes[]
// The number of available primes.
static const unsigned prime_count
unsigned hash(unsigned d, const std::string & word) const
// Hash, using the given degree, the given unsigned int into a bit index.
assert(d <= degree)
return h(d, word) % p
unsigned next_prime(unsigned i) const
// Return the smallest prime larger than the given integer.
const unsigned max = i*2
bit_vector sieve(max)
for unsigned j = 3; j < i; j += 2
if not sieve.get(j)
for unsigned k = j; k < max; k += j
sieve.set(k)
if not (i & 1) i++
while sieve.get(i) and (i < max) i += 2
assert(i < max)
return i
const unsigned
bloom_filter::primes[] =
101, 107, 113, 131, 139, 151, 163, 173, 181, 193, 199
const unsigned
bloom_filter::prime_count = sizeof(primes)/sizeof(unsigned)
int main(int argc, const char * const argv[])
cl_options opts
opts['b'] = "800000"
opts['d'] = "1"
unsigned o = get_cl_options(argc, argv, opts)
if o + 1 != static_cast<unsigned>(argc)
std::cerr
<< "Command format is \"" << argv[0] << " [-d n] [-b n] dictionary\".\n"
exit(EXIT_FAILURE)
std::ifstream ifs(argv[o])
if !ifs
std::cerr << "Failure opening " << argv[o] << ".\n"
exit(EXIT_FAILURE)
std::string word
bloom_filter bf(atoi(opts['b']), atoi(opts['d']))
while ifs >> word
bf.add(word)
ifs.close()
std::cerr << "load factor: " << bf.load_factor() << "\n"
while std::cin >> word
if not bf.find(word)
std::cout << word << "\n"
// $Log: bf.cc,v $
// Revision 1.1 2006-04-13 00:53:12 rclayton
// Created.
//
syntax highlighted by Code2HTML, v. 0.9.1