// Definition and use of a simple Bloom filter for text. Example use:
//
// $ ./bloomf dict < text
//
// The words in the dictionary are read into a Bloom filter. Any words from
// std-in that can't be found in the Bloom filter are written to std-out.
// Command-line options are:
//
// -b n - use n buckets in the Bloom filter; default n = 800,000.
// -d d - use a d-degree Bloom filter; default d = 1.
//
// CS 503 - Advanced Programming I, Spring 2006
#include <string>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cassert>
#include <vector>
#include "cl-options.h"
class bit_vector {
public:
// Create a new bit vector of the given number of bits; all bits are
// set to 0.
explicit bit_vector(unsigned size)
: bytes(new unsigned char [size/8 + ((size % 8) ? 1 : 0)]),
max(size) {
memset(bytes, 0, size/8);
}
// Delete this bit vector.
~bit_vector() {
delete [] bytes;
}
// Return true iff the bit at the given index is 1; die if the bit index
// isn't valid.
bool get(unsigned i) const {
assert(i < max);
return bytes[i/8] & (1 << (i % 8));
}
// Set the bit at the given index to 1; die if the bit index isn't
// valid.
void set(unsigned i) {
assert(i < max);
bytes[i/8] |= (1 << (i % 8));
}
// Return the number of indexable bits in this vector.
unsigned size() const {
return max;
}
private:
unsigned char * bytes;
unsigned max;
bit_vector & operator = (const bit_vector &) { return *this; }
bit_vector(const bit_vector &) : bytes(0), max(0) { }
};
class bloom_filter {
public:
// Create a new Bloom filter with the given number of bits and the given
// degree; the new filter is empty.
explicit bloom_filter(unsigned size, unsigned d = 1)
: p(next_prime(size)), bits(p), filled(0), degree(d) {
assert((0 < d) && (d <= prime_count));
}
// Add the given word to this Bloom filter.
void add(const std::string & word) {
for (unsigned d = 1; d <= degree; d++) {
const unsigned h = hash(d, word);
if (not bits.get(h)) {
filled++;
bits.set(h);
}
}
}
// Return true iff the given word is in this Bloom filter. Collisions may
// cause false positives.
bool find(const std::string & word) const {
bool found = true;
for (unsigned d = 1; found and (d <= degree); d++)
found = found and bits.get(hash(d, word));
return found;
}
// Retrn this filter's load factor.
double load_factor() const {
return static_cast<double>(filled)/bits.size();
}
private:
// The smallest prime larger then the requsted filter size.
unsigned p;
// The bloom filter.
bit_vector bits;
// The number of full buckets.
unsigned filled;
// The number of times a word should be hashed into the filter.
unsigned degree;
unsigned h(unsigned d, const std::string & word) const {
// Hash, using the given degree, the given string into an unsigned int.
unsigned h = 0;
const unsigned p = primes[d - 1];
for (unsigned i = 0; i < word.size(); i++)
h = p*h + word[i];
return h;
}
// An array of primes for use in h().
static const unsigned primes[];
// The number of available primes.
static const unsigned prime_count;
unsigned hash(unsigned d, const std::string & word) const {
// Hash, using the given degree, the given unsigned int into a bit index.
assert(d <= degree);
return h(d, word) % p;
}
unsigned next_prime(unsigned i) const {
// Return the smallest prime larger than the given integer.
const unsigned max = i*2;
bit_vector sieve(max);
for (unsigned j = 3; j < i; j += 2)
if (not sieve.get(j))
for (unsigned k = j; k < max; k += j)
sieve.set(k);
if (not (i & 1)) i++;
while (sieve.get(i) and (i < max)) i += 2;
assert(i < max);
return i;
}
};
const unsigned
bloom_filter::primes[] = {
101, 107, 113, 131, 139, 151, 163, 173, 181, 193, 199
};
const unsigned
bloom_filter::prime_count = sizeof(primes)/sizeof(unsigned);
int main(int argc, const char * const argv[]) {
cl_options opts;
opts['b'] = "800000";
opts['d'] = "1";
unsigned o = get_cl_options(argc, argv, opts);
if (o + 1 != argc) {
std::cerr
<< "Command format is \"" << argv[0] << " [-d n] [-b n] dictionary\".\n";
exit(EXIT_FAILURE);
}
std::ifstream ifs(argv[o]);
if (!ifs) {
std::cerr << "Failure opening " << argv[o] << ".\n";
exit(EXIT_FAILURE);
}
std::string word;
bloom_filter bf(atoi(opts['b']), atoi(opts['d']));
while (ifs >> word)
bf.add(word);
ifs.close();
std::cerr << "load factor: " << bf.load_factor() << "\n";
while (std::cin >> word)
if (not bf.find(word))
std::cout << word << "\n";
}
// $Log: bf.cc,v $
// Revision 1.1 2006-04-13 00:53:12 rclayton
// Created.
//
syntax highlighted by Code2HTML, v. 0.9.1