// Definition and use of a simple Bloom filter for text. Example use:
//
// $ ./bloomf dict < text
//
// The words in the dictionary are read into a Bloom filter. Any words from
// std-in that can't be found in the Bloom filter are written to std-out.
// Command-line options are:
//
// -b n - use n buckets in the Bloom filter; default n = 800,000.
// -d d - use a d-degree Bloom filter; default d = 1.
//
// CS 306 - Computer Algorithms II, Fall 2008
#include <string>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cassert>
#include <vector>
#include "cl-options.h"
class bit_vector {
public:
// Create a new bit vector of the given number of bits; all bits are
// set to 0.
explicit bit_vector(unsigned size)
: bytes(new unsigned char [size/8 + ((size % 8) ? 1 : 0)]),
max(size) {
memset(bytes, 0, size/8)
}
// Delete this bit vector.
~bit_vector() {
delete [] bytes
}
// Return true iff the bit at the given index is 1; die if the bit index
// isn't valid.
bool get(unsigned i) const {
assert(i < max)
return bytes[i/8] & (1 << (i % 8))
}
// Set the bit at the given index to 1; die if the bit index isn't
// valid.
void set(unsigned i) {
assert(i < max)
bytes[i/8] |= (1 << (i % 8))
}
// Return the number of indexable bits in this vector.
unsigned size() const {
return max;
}
private:
unsigned char * bytes
unsigned max
bit_vector & operator = (const bit_vector &) { return *this; }
bit_vector(const bit_vector &) : bytes(0), max(0) { }
}
class bloom_filter {
public:
// Create a new Bloom filter with the given number of bits and the given
// degree; the new filter is empty.
explicit bloom_filter(unsigned size, unsigned d = 1)
: p(next_prime(size)), bits(p), filled(0), degree(d) {
assert((0 < d) && (d ≤ prime_count))
}
// Add the given word to this Bloom filter.
void add(const std::string & word) {
for (unsigned d = 1; d ≤ degree; d++) {
const unsigned h = hash(d, word)
if (not bits.get(h)) {
filled++
bits.set(h)
}
}
}
// Return true iff the given word is in this Bloom filter. Collisions may
// cause false positives.
bool find(const std::string & word) const {
bool found = true
for (unsigned d = 1; found and (d ≤ degree); d++)
found = found and bits.get(hash(d, word))
return found
}
// Retrn this filter's load factor.
double load_factor() const {
return static_cast<double>(filled)/bits.size()
}
private:
// The smallest prime larger then the requsted filter size.
unsigned p
// The bloom filter.
bit_vector bits
// The number of full buckets.
unsigned filled
// The number of times a word should be hashed into the filter.
unsigned degree
unsigned h(unsigned d, const std::string & word) const {
// Hash, using the given degree, the given string into an unsigned int.
unsigned h = 0
const unsigned p = primes[d - 1]
for (unsigned i = 0; i < word.size(); i++)
h = p*h + word[i]
return h
}
// An array of primes for use in h().
static const unsigned primes[]
// The number of available primes.
static const unsigned prime_count
unsigned hash(unsigned d, const std::string & word) const {
// Hash, using the given degree, the given unsigned int into a bit index.
assert(d ≤ degree)
return h(d, word) % p
}
unsigned next_prime(unsigned i) const {
// Return the smallest prime larger than the given integer.
const unsigned max = i*2
bit_vector sieve(max)
for (unsigned j = 3; j < i; j += 2)
if (not sieve.get(j))
for (unsigned k = j; k < max; k += j)
sieve.set(k)
if (not (i & 1)) i++
while (sieve.get(i) and (i < max)) i += 2
assert(i < max)
return i
}
}
const unsigned
bloom_filter::primes[] = {
101, 107, 113, 131, 139, 151, 163, 173, 181, 193, 199
}
const unsigned
bloom_filter::prime_count = sizeof(primes)/sizeof(unsigned)
int main(int argc, const char * const argv[]) {
cl_options opts
opts['b'] = "800000"
opts['d'] = "1"
unsigned o = get_cl_options(argc, argv, opts)
if (o + 1 ≠ argc) {
std::cerr
<< "Command format is \"" << argv[0] << " [-d n] [-b n] dictionary\".\n"
exit(EXIT_FAILURE)
}
std::ifstream ifs(argv[o])
if (!ifs) {
std::cerr << "Failure opening " << argv[o] << ".\n"
exit(EXIT_FAILURE)
}
std::string word
bloom_filter bf(atoi(opts['b']), atoi(opts['d']))
while (ifs >> word)
bf.add(word)
ifs.close()
std::cerr << "load factor: " << bf.load_factor() << "\n"
while (std::cin >> word)
if (not bf.find(word))
std::cout << word << "\n"
}
// $Log: bf.cc,v $
// Revision 1.1 2006-04-13 00:53:12 rclayton
// Created.
//
syntax highlighted by Code2HTML, v. 0.9.1