// Definition and use of a simple Bloom filter for text.  Example use:
//
//   $ ./bloomf dict < text
//
// The words in the dictionary are read into a Bloom filter.  Any words from
// std-in that can't be found in the Bloom filter are written to std-out.
// Command-line options are:
//
//   -b n - use n buckets in the Bloom filter; default n = 800,000.
//   -d d - use a d-degree Bloom filter; default d = 1.
//
// CS 306 - Computer Algorithms II, Fall 2008

#include <string>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cassert>
#include <vector>
#include "cl-options.h"

class bit_vector {

  public:

    // Create a new bit vector of the given number of bits; all bits are
    // set to 0.

       explicit bit_vector(unsigned size) 
         : bytes(new unsigned char [size/8 + ((size % 8) ? 1 : 0)]), 
	   max(size) { 
	 memset(bytes, 0, size/8)
	 }


    // Delete this bit vector.

      ~bit_vector() {
  	 delete [] bytes
	 }


    // Return true iff the bit at the given index is 1; die if the bit index
    // isn't valid.

       bool get(unsigned i) const {
	 assert(i < max)
	 return bytes[i/8] & (1 << (i % 8))
	 }


    // Set the bit at the given index to 1; die if the bit index isn't
    // valid.

       void set(unsigned i) {
	 assert(i < max)
	 bytes[i/8] |= (1 << (i % 8))
	 }


    // Return the number of indexable bits in this vector.

       unsigned size() const { 
	 return max; 
	 }

  private:

    unsigned char * bytes
    unsigned max

    bit_vector & operator = (const bit_vector &) { return *this; }
    bit_vector(const bit_vector &) : bytes(0), max(0) { }
  }


class bloom_filter {

  public:

    // Create a new Bloom filter with the given number of bits and the given
    // degree; the new filter is empty.

       explicit bloom_filter(unsigned size, unsigned d = 1) 
	 : p(next_prime(size)), bits(p), filled(0), degree(d) {
	 assert((0 < d) && (d  prime_count))
	 }


    // Add the given word to this Bloom filter.

       void add(const std::string & word) {
	 for (unsigned d = 1; d  degree; d++) {
	   const unsigned h = hash(d, word)
	   if (not bits.get(h)) {
	     filled++
	     bits.set(h)
  	     }
	   }
	 }


    // Return true iff the given word is in this Bloom filter.  Collisions may
    // cause false positives.

       bool find(const std::string & word) const {
	 bool found = true
	 for (unsigned d = 1; found and (d  degree); d++)
	   found = found and bits.get(hash(d, word))
	 return found
	 }


    // Retrn this filter's load factor.

       double load_factor() const {
	 return static_cast<double>(filled)/bits.size()
	 }


  private:

    // The smallest prime larger then the requsted filter size.

       unsigned p


    // The bloom filter.

       bit_vector bits


    // The number of full buckets.

       unsigned filled


    // The number of times a word should be hashed into the filter.

       unsigned degree


    unsigned h(unsigned d, const std::string & word) const {

      // Hash, using the given degree, the given string into an unsigned int.

      unsigned h = 0
      const unsigned p = primes[d - 1]

      for (unsigned i = 0; i < word.size(); i++)
	h = p*h + word[i]
      return h
      }


    // An array of primes for use in h().

       static const unsigned primes[]


    // The number of available primes.

       static const unsigned prime_count


    unsigned hash(unsigned d, const std::string & word) const {


      // Hash, using the given degree, the given unsigned int into a bit index.

      assert(d  degree)
      return h(d, word) % p
      }


    unsigned next_prime(unsigned i) const {
    
      // Return the smallest prime larger than the given integer.

      const unsigned max = i*2
      bit_vector sieve(max)

      for (unsigned j = 3; j < i; j += 2)
	if (not sieve.get(j)) 
	  for (unsigned k = j; k < max; k += j)
	    sieve.set(k)

      if (not (i & 1)) i++
      while (sieve.get(i) and (i < max)) i += 2
      assert(i < max)

      return i
      }
  }


const unsigned 
bloom_filter::primes[] = {
  101, 107, 113, 131, 139, 151, 163, 173, 181, 193, 199 
  }

const unsigned 
bloom_filter::prime_count = sizeof(primes)/sizeof(unsigned)



int main(int argc, const char * const argv[]) {

  cl_options opts
  opts['b'] = "800000"
  opts['d'] = "1"
  unsigned o = get_cl_options(argc, argv, opts)

  if (o + 1  argc) {
    std::cerr 
      << "Command format is \"" << argv[0] << " [-d n] [-b n] dictionary\".\n"
    exit(EXIT_FAILURE)
    }
  std::ifstream ifs(argv[o])
  if (!ifs) {
    std::cerr << "Failure opening " << argv[o] << ".\n"
    exit(EXIT_FAILURE)
    }

  std::string word
  bloom_filter bf(atoi(opts['b']), atoi(opts['d']))

  while (ifs >> word)
    bf.add(word)

  ifs.close()

  std::cerr << "load factor:  " << bf.load_factor() << "\n"

  while (std::cin >> word)
    if (not bf.find(word))
      std::cout << word << "\n"
  }


// $Log: bf.cc,v $
// Revision 1.1  2006-04-13 00:53:12  rclayton
// Created.
//


syntax highlighted by Code2HTML, v. 0.9.1