// Definition and use of a simple Bloom filter for text.  Example use:
//
//   $ ./bloomf dict < text
//
// The words in the dictionary are read into a Bloom filter.  Any words from
// std-in that can't be found in the Bloom filter are written to std-out.
// Command-line options are:
//
//   -b n - use n buckets in the Bloom filter; default n = 800,000.
//   -d d - use a d-degree Bloom filter; default d = 1.
//
// CS 503 - Advanced Programming I, Spring 2006

#include <string>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cassert>
#include <vector>
#include "cl-options.h"

class bit_vector {

  public:

    // Create a new bit vector of the given number of bits; all bits are
    // set to 0.

       explicit bit_vector(unsigned size) 
         : bytes(new unsigned char [size/8 + ((size % 8) ? 1 : 0)]), 
	   max(size) { 
	 memset(bytes, 0, size/8);
	 }


    // Delete this bit vector.

      ~bit_vector() {
  	 delete [] bytes;
	 }


    // Return true iff the bit at the given index is 1; die if the bit index
    // isn't valid.

       bool get(unsigned i) const {
	 assert(i < max);
	 return bytes[i/8] & (1 << (i % 8));
	 }


    // Set the bit at the given index to 1; die if the bit index isn't
    // valid.

       void set(unsigned i) {
	 assert(i < max);
	 bytes[i/8] |= (1 << (i % 8));
	 }


    // Return the number of indexable bits in this vector.

       unsigned size() const { 
	 return max; 
	 }

  private:

    unsigned char * bytes;
    unsigned max;

    bit_vector & operator = (const bit_vector &) { return *this; }
    bit_vector(const bit_vector &) : bytes(0), max(0) { }
  };


class bloom_filter {

  public:

    // Create a new Bloom filter with the given number of bits and the given
    // degree; the new filter is empty.

       explicit bloom_filter(unsigned size, unsigned d = 1) 
	 : p(next_prime(size)), bits(p), filled(0), degree(d) {
	 assert((0 < d) && (d <= prime_count));
	 }


    // Add the given word to this Bloom filter.

       void add(const std::string & word) {
	 for (unsigned d = 1; d <= degree; d++) {
	   const unsigned h = hash(d, word);
	   if (not bits.get(h)) {
	     filled++;
	     bits.set(h);
  	     }
	   }
	 }


    // Return true iff the given word is in this Bloom filter.  Collisions may
    // cause false positives.

       bool find(const std::string & word) const {
	 bool found = true;
	 for (unsigned d = 1; found and (d <= degree); d++)
	   found = found and bits.get(hash(d, word));
	 return found;
	 }


    // Retrn this filter's load factor.

       double load_factor() const {
	 return static_cast<double>(filled)/bits.size();
	 }


  private:

    // The smallest prime larger then the requsted filter size.

       unsigned p;


    // The bloom filter.

       bit_vector bits;


    // The number of full buckets.

       unsigned filled;


    // The number of times a word should be hashed into the filter.

       unsigned degree;


    unsigned h(unsigned d, const std::string & word) const {

      // Hash, using the given degree, the given string into an unsigned int.

      unsigned h = 0;
      const unsigned p = primes[d - 1];

      for (unsigned i = 0; i < word.size(); i++)
	h = p*h + word[i];
      return h;
      }


    // An array of primes for use in h().

       static const unsigned primes[];


    // The number of available primes.

       static const unsigned prime_count;


    unsigned hash(unsigned d, const std::string & word) const {


      // Hash, using the given degree, the given unsigned int into a bit index.

      assert(d <= degree);
      return h(d, word) % p;
      }


    unsigned next_prime(unsigned i) const {
    
      // Return the smallest prime larger than the given integer.

      const unsigned max = i*2;
      bit_vector sieve(max);

      for (unsigned j = 3; j < i; j += 2)
	if (not sieve.get(j)) 
	  for (unsigned k = j; k < max; k += j)
	    sieve.set(k);

      if (not (i & 1)) i++;
      while (sieve.get(i) and (i < max)) i += 2;
      assert(i < max);

      return i;
      }
  };


const unsigned 
bloom_filter::primes[] = {
  101, 107, 113, 131, 139, 151, 163, 173, 181, 193, 199 
  };

const unsigned 
bloom_filter::prime_count = sizeof(primes)/sizeof(unsigned);



int main(int argc, const char * const argv[]) {

  cl_options opts;
  opts['b'] = "800000";
  opts['d'] = "1";
  unsigned o = get_cl_options(argc, argv, opts);

  if (o + 1 != argc) {
    std::cerr 
      << "Command format is \"" << argv[0] << " [-d n] [-b n] dictionary\".\n";
    exit(EXIT_FAILURE);
    }
  std::ifstream ifs(argv[o]);
  if (!ifs) {
    std::cerr << "Failure opening " << argv[o] << ".\n";
    exit(EXIT_FAILURE);
    }

  std::string word;
  bloom_filter bf(atoi(opts['b']), atoi(opts['d']));

  while (ifs >> word)
    bf.add(word);

  ifs.close();

  std::cerr << "load factor:  " << bf.load_factor() << "\n";

  while (std::cin >> word)
    if (not bf.find(word))
      std::cout << word << "\n";
  }


// $Log: bf.cc,v $
// Revision 1.1  2006-04-13 00:53:12  rclayton
// Created.
//


syntax highlighted by Code2HTML, v. 0.9.1