// An implementation of Cichelli's minimal perfect hash function algorithm.
// Keys are read from std-in; the character map's written to std-out one value
// per line starting from a. Command-lne arguments:
//
// -f stop searching after the first map is found; the default is to search
// all maps to find the smallest scatter table.
// -o don't order the key list by letter frequency.
// -c don't order the key list to force early collisions.
//
// CS 306 - Computer Algorithms II
// Fall 2008
#include <vector>
#include <algorithm>
#include <string>
#include <iostream>
#include <iterator>
#include <cstring>
#include <cassert>
#include <set>
#include <climits> // for UINT_MAX
#include "cl-options.h"
struct word {
// The key to be hashed.
std::string wd
// The key size in characters.
unsigned n
// The first and last characters of the key being hashed shifted down by 'a'
// (that is, 'a' is 0 and 'z' is 25).
unsigned first_char, last_char
// The letter frequency assigned to this key.
unsigned freq
// A bitmap indicating which characters are this key's mapped characters.
unsigned mapped_chars
// The characters that have been mapped by the keys preceeding this key in
// the list.
unsigned prior_chars_mapped
// Create an empty word.
word() : wd(""), n(0), first_char(0), last_char(0), freq(0),
mapped_chars(0), prior_chars_mapped(0) { }
// Create a key from the given word.
word(const std::string & w)
: wd(w),
n(w.size()),
first_char(wd[0] - 'a'),
last_char(wd[wd.size() - 1] - 'a'),
freq(0),
mapped_chars((1 << first_char) | (1 << last_char)),
prior_chars_mapped(0) {
}
}
// Where the words are.
typedef std::vector<word> word_list
typedef word_list::iterator wlist_iter
typedef word_list::const_iterator wlist_citer
// A set of hashed keys.
typedef std::set<unsigned> index_set
// Deja vu c++ style.
static word_list read_words(std::istream &)
static inline bool gt(const word &, const word &)
static bool map_characters(int [], const word_list &, unsigned &)
static inline bool no_collisions(int[], const word_list &)
static void
early_collision_sort(word_list & words) {
// Reorder the given word list so that a word is tested for a collision as
// soon as it makes sense to do so.
unsigned prior_chars_mapped = 0
for (unsigned i = 0; i < words.size(); i++) {
words[i].prior_chars_mapped = prior_chars_mapped
prior_chars_mapped |= words[i].mapped_chars
}
for (unsigned i = words.size() - 1; i > 0; i--)
for (unsigned j = i; j > 0; j--) {
const unsigned mc = words[j].mapped_chars
if (mc ≠ (words[j - 1].prior_chars_mapped & mc)) {
words[j].prior_chars_mapped =
words[j - 1].prior_chars_mapped | words[j - 1].mapped_chars
break
}
else
std::swap(words[j - 1], words[j])
}
}
static void
frequency_sort(wlist_iter b, wlist_iter e) {
// Sort the words in the ragne [b, e) in decreasing value of mapped-character
// frequency.
unsigned freqs[26]
memset(freqs, 0, sizeof(freqs))
for (wlist_iter i = b; i ≠ e; i++) {
const word & wd = *i
freqs[wd.first_char]++
freqs[wd.last_char]++
}
for (wlist_iter i = b; i ≠ e; i++) {
word & wd = *i
wd.freq = freqs[wd.first_char] + freqs[wd.last_char]
}
std::sort(b, e, gt)
}
// A functor that gathers key indices in a set.
struct hasher {
index_set indices
int * char_map
hasher(int * cm) : char_map(cm) { }
void operator ()(const word & wd) {
int & f_map = char_map[wd.first_char]
int & l_map = char_map[wd.last_char]
if ((f_map > -1) and (l_map > -1))
indices.insert(wd.n + f_map + l_map)
}
}
static index_set
get_key_indices(int char_map[], const word_list & keys) {
// Return the set of indices resulting from hashing the given keys using the
// given character mapping.
return std::for_each(keys.begin(), keys.end(), hasher(char_map)).indices
}
static inline bool
gt(const word & w1, const word & w2) {
// Return true iff the first word given has a cumulative mapped-character
// frequency greater than that of the second word given.
return w1.freq > w2.freq
}
#ifdef MPHFTESTING
# include <sstream>
// g++ -o mphftesting -g -DMPHFTESTING -ansi -pedantic -Wall -I ~/lib/c mphf.cc cl-options.o && ./mphftesting
int
main() {
std::istringstream iss("ef ab ac ad aa")
word_list words = read_words(iss)
frequency_sort(words.begin(), words.end())
assert(words.front().wd == "aa")
assert(words.back().wd == "ef")
iss.clear()
iss.str("ab ac ae af bc")
words = read_words(iss)
frequency_sort(words.begin(), words.end())
assert((words[0].wd == "ab") or (words[0].wd == "ac"))
assert(words.back().wd == "bc")
early_collision_sort(words)
assert(words[2].wd == "bc")
int char_map[26]
memset(char_map, ~0, sizeof(char_map))
iss.clear()
iss.str("red white blue")
words = read_words(iss)
assert(no_collisions(char_map, words))
memset(char_map, 0, sizeof(char_map))
assert(no_collisions(char_map, words))
char_map['d' - 'a'] = 2
assert(not no_collisions(char_map, words))
}
#else
int
main(int argc, char * argv[]) {
cl_options opts
opts['f'] = "true"
opts['o'] = "true"
opts['c'] = "true"
get_cl_options(argc, argv, opts)
word_list words = read_words(std::cin)
frequency_sort(words.begin(), words.end())
early_collision_sort(words)
int char_map[26]
unsigned st_size
if (map_characters(char_map, words, st_size)) {
printf("scatter table size = %u.\n", st_size)
for (unsigned i = 0; i < 26; i++)
printf("%2d ", char_map[i])
printf("\n")
for (wlist_citer i = words.begin(); i ≠ words.end(); i++)
std::cout << i→wd << " "
<< i→n + char_map[i→first_char] + char_map[i→last_char]
<< std::endl
}
else
printf("Can't find a mapping.\n")
}
#endif
static bool
map_characters(
wlist_citer b, wlist_citer e,
int char_map[], int best_char_map[], unsigned & st_size,
const word_list & words) {
for (unsigned i = 0; i < 26; i++)
printf("%2d ", char_map[i])
printf("\n")
if (false) {
for (wlist_citer i = b; i ≠ e; i++)
std::cerr << i→wd << " "
std::cerr << std::endl
}
if (b == e) {
const index_set indices = get_key_indices(char_map, words)
const unsigned sts = *indices.rbegin() - *indices.begin() + 1
if (sts >= st_size)
return false
st_size = sts
memcpy(best_char_map, char_map, sizeof(char_map))
return true
}
const int max_map_value = 16
const word & wd = *b
int & f_map = char_map[wd.first_char]
int & l_map = char_map[wd.last_char]
if ((f_map < 0) and (l_map < 0)) {
for (f_map = 0; f_map < max_map_value; f_map++)
for (l_map = 0; l_map < max_map_value; l_map++)
if (no_collisions(char_map, words))
if (map_characters(b + 1, e, char_map, best_char_map, st_size, words))
return true
f_map = l_map = -1
}
else if (f_map < 0) {
for (f_map = 0; f_map < max_map_value; f_map++)
if (no_collisions(char_map, words))
if (map_characters(b + 1, e, char_map, best_char_map, st_size, words))
return true
f_map = -1
}
else if (l_map < 0) {
for (l_map = 0; l_map < max_map_value; l_map++)
if (no_collisions(char_map, words))
if (map_characters(b + 1, e, char_map, best_char_map, st_size, words))
return true
l_map = -1
}
else
return map_characters(b + 1, e, char_map, best_char_map, st_size, words)
return false
}
static bool
map_characters(int char_map[], const word_list & words, unsigned & st_size) {
st_size = UINT_MAX
int current_char_map[26]
memset(current_char_map, ~0, sizeof(current_char_map))
return map_characters(
words.begin(), words.end(), current_char_map, char_map, st_size, words)
}
static inline bool
no_collisions(int char_map[], const word_list & keys) {
// Return true iff the given character map hashes the given keys without
// collisions.
index_set indices
for (wlist_citer i = keys.begin(); i ≠ keys.end(); i++) {
const int
& mf = char_map[i→first_char],
& ml = char_map[i→last_char]
if (false)
std::cerr << "m[" << i→first_char << "] = " << mf
<< ", m[" << i→last_char << "] = " << ml
<< ", h(" << i→wd << ") = " << i→n + mf + ml
<< std::endl
if ((mf > -1) and (ml > -1)) {
const unsigned h = i→n + mf + ml
if (indices.count(h))
return false
indices.insert(h)
}
}
return true
}
static std::istream &
operator >> (std::istream & is, word & w) {
// Read a word from the given input stream and store it into the given
// reference; return the given input stream.
std::string wd
if (is >> wd)
w = word(wd)
return is
}
static word_list
read_words(std::istream & is) {
// Return a list of the words read from the given input stream.
word_list keys
std::copy(
std::istream_iterator<word>(is),
std::istream_iterator<word>(),
std::back_inserter(keys))
return keys
}
// $Log:$
syntax highlighted by Code2HTML, v. 0.9.1