// An implementation of Cichelli's minimal perfect hash function algorithm.
// Keys are read from std-in; the character map's written to std-out one value
// per line starting from a.  Command-lne arguments:
//
//   -f stop searching after the first map is found; the default is to search
//      all maps to find the smallest scatter table.
//   -o don't order the key list by letter frequency.
//   -c don't order the key list to force early collisions.
//
// CS 306 - Computer Algorithms II
// Fall 2008

#include <vector>
#include <algorithm>
#include <string>
#include <iostream>
#include <iterator>
#include <cstring>
#include <cassert>
#include <set>
#include <climits>	// for UINT_MAX
#include "cl-options.h"


struct word {

  // The key to be hashed.

     std::string wd


  // The key size in characters.

     unsigned n


  // The first and last characters of the key being hashed shifted down by 'a'
  // (that is, 'a' is 0 and 'z' is 25).

     unsigned first_char, last_char


  // The letter frequency assigned to this key.

     unsigned freq


  // A bitmap indicating which characters are this key's mapped characters.

    unsigned mapped_chars


  // The characters that have been mapped by the keys preceeding this key in
  // the list.

    unsigned prior_chars_mapped


  // Create an empty word.

    word() : wd(""), n(0), first_char(0), last_char(0), freq(0), 
	      mapped_chars(0), prior_chars_mapped(0) { }


  // Create a key from the given word.

     word(const std::string & w) 
       : wd(w), 
         n(w.size()), 
	 first_char(wd[0] - 'a'),
	 last_char(wd[wd.size() - 1] - 'a'),
	 freq(0),
	 mapped_chars((1 << first_char) | (1 << last_char)),
         prior_chars_mapped(0) {
       }
  }


// Where the words are.

   typedef std::vector<word> word_list
   typedef word_list::iterator wlist_iter
   typedef word_list::const_iterator wlist_citer


// A set of hashed keys.

   typedef std::set<unsigned> index_set


// Deja vu c++ style.

   static word_list read_words(std::istream &)
   static inline bool gt(const word &, const word &)
   static bool map_characters(int [], const word_list &, unsigned &)
   static inline bool no_collisions(int[], const word_list &)


static void
early_collision_sort(word_list & words) {

  // Reorder the given word list so that a word is tested for a collision as
  // soon as it makes sense to do so.

  unsigned prior_chars_mapped = 0
  for (unsigned i = 0; i < words.size(); i++) {
    words[i].prior_chars_mapped = prior_chars_mapped
    prior_chars_mapped |= words[i].mapped_chars
    }

  for (unsigned i = words.size() - 1; i > 0; i--)
    for (unsigned j = i; j > 0; j--) {
      const unsigned mc = words[j].mapped_chars
      if (mc  (words[j - 1].prior_chars_mapped & mc)) {
	words[j].prior_chars_mapped = 
	  words[j - 1].prior_chars_mapped | words[j - 1].mapped_chars
	break
	}
      else
	std::swap(words[j - 1], words[j])
      }
  }


static void
frequency_sort(wlist_iter b, wlist_iter e) {

  // Sort the words in the ragne [b, e) in decreasing value of mapped-character
  // frequency.

  unsigned freqs[26]
  memset(freqs, 0, sizeof(freqs))
  for (wlist_iter i = b; i  e; i++) {
    const word & wd = *i
    freqs[wd.first_char]++
    freqs[wd.last_char]++
    }

  for (wlist_iter i = b; i  e; i++) {
    word & wd = *i
    wd.freq = freqs[wd.first_char] + freqs[wd.last_char]
    }
  std::sort(b, e, gt)
  }


// A functor that gathers key indices in a set.

   struct hasher {

     index_set indices
     int * char_map

     hasher(int * cm) : char_map(cm) { }

     void operator ()(const word & wd) {
       int & f_map = char_map[wd.first_char]
       int & l_map = char_map[wd.last_char]
       if ((f_map > -1) and (l_map > -1))
	 indices.insert(wd.n + f_map + l_map)
       }
     }

static index_set
get_key_indices(int char_map[], const word_list & keys) {

  // Return the set of indices resulting from hashing the given keys using the
  // given character mapping.

  return std::for_each(keys.begin(), keys.end(), hasher(char_map)).indices
  }


static inline bool
gt(const word & w1, const word & w2) {

  // Return true iff the first word given has a cumulative mapped-character
  // frequency greater than that of the second word given.

  return w1.freq > w2.freq
  }


#ifdef MPHFTESTING

# include <sstream>

  // g++ -o mphftesting -g -DMPHFTESTING -ansi -pedantic -Wall -I ~/lib/c mphf.cc cl-options.o && ./mphftesting

  int
  main() {
    std::istringstream iss("ef ab ac ad aa")
    word_list words = read_words(iss)
    frequency_sort(words.begin(), words.end())
    assert(words.front().wd == "aa")
    assert(words.back().wd == "ef")

    iss.clear()
    iss.str("ab ac ae af bc")
    words = read_words(iss)
    frequency_sort(words.begin(), words.end())
    assert((words[0].wd == "ab") or (words[0].wd == "ac"))
    assert(words.back().wd == "bc")
    early_collision_sort(words)
    assert(words[2].wd == "bc")

    int char_map[26]
    memset(char_map, ~0, sizeof(char_map))
    iss.clear()
    iss.str("red white blue")
    words = read_words(iss)
    assert(no_collisions(char_map, words))

    memset(char_map, 0, sizeof(char_map))
    assert(no_collisions(char_map, words))

    char_map['d' - 'a'] = 2
    assert(not no_collisions(char_map, words))
    }

#else

  int 
  main(int argc, char * argv[]) {

    cl_options opts
    opts['f'] = "true"
    opts['o'] = "true"
    opts['c'] = "true"
    get_cl_options(argc, argv, opts)
  
    word_list words = read_words(std::cin)

    frequency_sort(words.begin(), words.end())
    early_collision_sort(words)

    int char_map[26]
    unsigned st_size
    if (map_characters(char_map, words, st_size)) {
      printf("scatter table size = %u.\n", st_size)

      for (unsigned i = 0; i < 26; i++)
	printf("%2d ", char_map[i])
      printf("\n")

      for (wlist_citer i = words.begin(); i  words.end(); i++)
	std::cout << iwd << " " 
		  << in + char_map[ifirst_char] + char_map[ilast_char]
		  << std::endl
      }
    else
      printf("Can't find a mapping.\n")
    }

#endif


static bool
map_characters(
   wlist_citer b, wlist_citer e, 
   int char_map[], int best_char_map[], unsigned & st_size,
   const word_list & words) {

  for (unsigned i = 0; i < 26; i++)
    printf("%2d ", char_map[i])
  printf("\n")

  if (false) {
    for (wlist_citer i = b; i  e; i++)
      std::cerr << iwd << " "
    std::cerr << std::endl
    }

  if (b == e) {
    const index_set indices = get_key_indices(char_map, words)
    const unsigned sts = *indices.rbegin() - *indices.begin() + 1
    if (sts >= st_size)
      return false
    
    st_size = sts
    memcpy(best_char_map, char_map, sizeof(char_map))
    return true
    }

  const int max_map_value = 16

  const word & wd = *b
  int & f_map = char_map[wd.first_char]
  int & l_map = char_map[wd.last_char]

  if ((f_map < 0) and (l_map < 0)) {
    for (f_map = 0; f_map < max_map_value; f_map++)
      for (l_map = 0; l_map < max_map_value; l_map++)
	if (no_collisions(char_map, words))
	  if (map_characters(b + 1, e, char_map, best_char_map, st_size, words))
	    return true
    f_map = l_map = -1
    }

  else if (f_map < 0) {
    for (f_map = 0; f_map < max_map_value; f_map++)
      if (no_collisions(char_map, words))
	if (map_characters(b + 1, e, char_map, best_char_map, st_size, words))
	  return true
    f_map = -1
    }

  else if (l_map < 0) {
    for (l_map = 0; l_map < max_map_value; l_map++)
      if (no_collisions(char_map, words))
	if (map_characters(b + 1, e, char_map, best_char_map, st_size, words))
	  return true
    l_map = -1
    }

  else
    return map_characters(b + 1, e, char_map, best_char_map, st_size, words)

  return false
  }


static bool
map_characters(int char_map[], const word_list & words, unsigned & st_size) {

  st_size = UINT_MAX
  int current_char_map[26]

  memset(current_char_map, ~0, sizeof(current_char_map))

  return map_characters(
    words.begin(), words.end(), current_char_map, char_map, st_size, words)
  }


static inline bool
no_collisions(int char_map[], const word_list & keys) {

  // Return true iff the given character map hashes the given keys without
  // collisions.

  index_set indices

  for (wlist_citer i = keys.begin(); i  keys.end(); i++) {
    const int
      & mf = char_map[ifirst_char],
      & ml = char_map[ilast_char]

    if (false)
      std::cerr << "m[" << ifirst_char << "] = " << mf
		<< ", m[" << ilast_char << "] = " << ml 
		<< ", h(" << iwd << ") = " << in + mf + ml
		<< std::endl

    if ((mf > -1) and (ml > -1)) {
      const unsigned h = in + mf + ml

      if (indices.count(h))
	return false

      indices.insert(h)
      }
    }

  return true
  }


static std::istream &
operator >> (std::istream & is, word & w) {

  // Read a word from the given input stream and store it into the given
  // reference; return the given input stream.

  std::string wd

  if (is >> wd)
    w = word(wd)

  return is
  }


static word_list
read_words(std::istream & is) {

  // Return a list of the words read from the given input stream.

  word_list keys

  std::copy(
    std::istream_iterator<word>(is),
    std::istream_iterator<word>(),
    std::back_inserter(keys))

  return keys
  }


// $Log:$


syntax highlighted by Code2HTML, v. 0.9.1