// Definition and use of a simple Bloom filter for text.  Example use:
//
//   $ ./bloomf dict < text
//
// The words in the dictionary are read into a Bloom filter.  Any words from
// std-in that can't be found in the Bloom filter are written to std-out.
// Command-line options are:
//
//   -b n - use n buckets in the Bloom filter; default n = 800,000.
//   -d d - use a d-degree Bloom filter; default d = 1.
//
// CS 306 - Computer Algorithms II, Fall 2007

#include <string>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cassert>
#include <vector>
#include "cl-options.h"

class bit_vector

  public:

    // Create a new bit vector of the given number of bits; all bits are
    // set to 0.

       explicit bit_vector(unsigned size) 
         : bytes(new unsigned char [size/8 + ((size % 8) ? 1 : 0)]), 
	   max(size)
	 memset(bytes, 0, size/8)


    // Delete this bit vector.

      ~bit_vector()
  	 delete [] bytes


    // Return true iff the bit at the given index is 1; die if the bit index
    // isn't valid.

       bool get(unsigned i) const
	 return bref(i) & bshift(i)


    // Set the bit at the given index to 1; die if the bit index isn't
    // valid.

       void set(unsigned i)
	 bref(i) |= bshift(i)


    // Return the number of indexable bits in this vector.

       unsigned size() const
	 return max

  private:

    unsigned char * bytes
    unsigned max

    inline unsigned char & bref(unsigned i) const
      assert(i < max)
      return bytes[i/8]

    inline unsigned bshift(unsigned i) const
      return 1 << (i % 8)



class bloom_filter

  public:

    // Create a new Bloom filter with the given number of bits and the given
    // degree; the new filter is empty.

       explicit bloom_filter(unsigned size, unsigned d = 1) 
	 : p(next_prime(size)), bits(p), filled(0), degree(d)
	 assert((0 < d) && (d <= prime_count))


    // Add the given word to this Bloom filter.

       void add(const std::string & word)
	 for unsigned d = 1; d <= degree; d++
	   const unsigned h = hash(d, word)
	   if not bits.get(h)
	     filled++
	     bits.set(h)


    // Return true iff the given word is in this Bloom filter.  Collisions may
    // cause false positives.

       bool find(const std::string & word) const
	 bool found = true
	 for unsigned d = 1; found and (d <= degree); d++
	   found = found and bits.get(hash(d, word))
	 return found


    // Retrn this filter's load factor.

       double load_factor() const
	 return static_cast<double>(filled)/bits.size()


  private:

    // The smallest prime larger then the requsted filter size.

       unsigned p


    // The bloom filter.

       bit_vector bits


    // The number of full buckets.

       unsigned filled


    // The number of times a word should be hashed into the filter.

       unsigned degree


    unsigned h(unsigned d, const std::string & word) const

      // Hash, using the given degree, the given string into an unsigned int.

      unsigned h = 0
      const unsigned p = primes[d - 1]

      for unsigned i = 0; i < word.size(); i++
	h = p*h + word[i]
      return h


    // An array of primes for use in h().

       static const unsigned primes[]


    // The number of available primes.

       static const unsigned prime_count


    unsigned hash(unsigned d, const std::string & word) const


      // Hash, using the given degree, the given unsigned int into a bit index.

      assert(d <= degree)
      return h(d, word) % p


    unsigned next_prime(unsigned i) const
    
      // Return the smallest prime larger than the given integer.

      const unsigned max = i*2
      bit_vector sieve(max)

      for unsigned j = 3; j < i; j += 2
	if not sieve.get(j) 
	  for unsigned k = j; k < max; k += j
	    sieve.set(k)

      if not (i & 1) i++
      while sieve.get(i) and (i < max) i += 2
      assert(i < max)

      return i


const unsigned 
bloom_filter::primes[] =
  101, 107, 113, 131, 139, 151, 163, 173, 181, 193, 199 

const unsigned 
bloom_filter::prime_count = sizeof(primes)/sizeof(unsigned)



int main(int argc, const char * const argv[])

  cl_options opts
  opts['b'] = "800000"
  opts['d'] = "1"
  unsigned o = get_cl_options(argc, argv, opts)

  if o + 1 != static_cast<unsigned>(argc)
    std::cerr 
      << "Command format is \"" << argv[0] << " [-d n] [-b n] dictionary\".\n"
    exit(EXIT_FAILURE)
  std::ifstream ifs(argv[o])
  if !ifs
    std::cerr << "Failure opening " << argv[o] << ".\n"
    exit(EXIT_FAILURE)

  std::string word
  bloom_filter bf(atoi(opts['b']), atoi(opts['d']))

  while ifs >> word
    bf.add(word)

  ifs.close()

  std::cerr << "load factor:  " << bf.load_factor() << "\n"

  while std::cin >> word
    if not bf.find(word)
      std::cout << word << "\n"


// $Log: bf.cc,v $
// Revision 1.1  2006-04-13 00:53:12  rclayton
// Created.
//


syntax highlighted by Code2HTML, v. 0.9.1