Example Solution for Programming Assignment 2, Advanced Programming I

Advanced Programming I, Spring 2006

Programming Assignment 2 - An Example Solution

Main
The Encoding
Utility Procedures
References
Index

The only command-line option is -s, which determines whether or not the words are sorted before the character assignments are done (s for "slow"). The default is to do the sorting. This option is not part of the assignment; it was added to illustrate the effects of sorting.

<handle command-line options>= (←U)

bool do_sorting = true;

if ((2 == argc) ∧ (0 == strcmp(argv[1], "-s")))
  do_sorting = false;
else if (argc > 1) {
  print_error(std::string("Command format is \"") + argv[0] + " [-s]\"");
  exit(EXIT_FAILURE);
  }

Input

Because error handling is simple, just throw an execption if anything goes wrong during input and print the exception before exiting.

<read input>= (←U)

try { words = read_words(std::cin); }
catch (std::string err) {
  print_error(err);
  exit(EXIT_FAILURE);
  }

Read a word from the given input stream; return the list of all words read in the order read and ending with the empty string. cnt is the number of words that have been read already. Throw a string exception if anything untoward happens.

This is an example of how you can use recursion to read an arbitrary number of words from input and assign them to an array. The array is created after all the words have been read; in the mean time, each word is stored in an activation record on the run-time stack.

<input procedures>= (U→) [D→]

static std::string *
read_words(std::istream & is, unsigned cnt) {

  std::string word;
  std::string * words;

  if (is >> word) {
    check_word(word);
    words = read_words(is, cnt + 1);
    word = massage_word(word);
    remove_duplicate(words + cnt + 1, word);
    words[cnt] = word;
    }
  else if (is.eof()) {
    words = new std::string[cnt + 1];
    words[cnt] = "";
    }
  else
    throw std::string("error during word read");

  return words;
  }

Defines read_words (links are to index).

Check the given word for validity; throw a string exception if the word is invalid. To be valid a word should consist only of alphabetic characters.

<input procedures>+= (U→) [←D→]

static void
check_word(const std::string & word) {
  for (unsigned i = 0; i < word.size(); i++)
    if (not isalpha(word[i]))
      throw std::string("A word contains a non-alphabetic character");
  }

Defines check_word (links are to index).

Return the given word in lower case.

<input procedures>+= (U→) [←D]

static std::string
massage_word(std::string word) {
  for (unsigned i = 0; i < word.size(); i++) {
    const char & c = word[i];
    assert(isalpha(c));
    word[i] = tolower(c);
    }
  return word;
  }

Defines massage_word (links are to index).

Searching for Character Values

Assign and check encodings for the characters associated with the word at the given index in the given word list.

<character-value procedures>= (U→) [D→]

static bool
assign_values(
  const std::string * const words, encoding & e, unsigned cnt = 0) {

  const std::string & str = words[cnt];

  if (str.empty())
    return true;

  if (0 == e[str[0]])
    return assign_character_value(e[str[0]], words, e, cnt);

  if (0 == e[str[str.size() - 1]])
    return assign_character_value(e[str[str.size() - 1]], words, e, cnt);

  if (valid_encoding(words, e, cnt))
    return assign_values(words, e, cnt + 1);

  return false;
  }

Defines assign_values (links are to index).

<character-value procedures>+= (U→) [←D→]

static bool
assign_character_value(
  encoding::code_type & ec, 
  const std::string * const words, encoding & e, unsigned cnt) {

  for (encoding::code_type i = 1; i < max_code_value; i++) {
      ec = i;
      if (assign_values(words, e, cnt))
        return true;
      }
  ec = 0;

  return false;
  }

Defines assign_character_value (links are to index).

Return true iff the given word list contains no hash-value collisions under the given character encodings.

<character-value procedures>+= (U→) [←D]

static bool 
valid_encoding(
  const std::string * const words, const encoding & e, unsigned cnt) {
  for (unsigned i = 0; i < cnt; i++) {
    const unsigned h = hash(e, words[i]);
    for (unsigned j = i + 1; not words[j].empty(); j++) 
      if (h == hash(e, words[j]))
        return false;
    }
  return true;
  }

Defines valid_encoding (links are to index).

<search for character values>= (←U)

encoding e;

if (do_sorting)
  words = order_words(words);
if (assign_values(words, e))
  std::cout << e;
else
  print_error("No character encoding possible");

<gen-cv.cc>=

#include <string>
#include <iostream>
#include <cstdlib>
#include <cctype>
#include <cstring>
#include "encoding.h"
#include "utils.h"


// The numbers of characters in the encoding.

   unsigned encoding::char_cnt = 26;


// The largest possible code value.

   static encoding::code_type max_code_value = 15;


// Deja vu c++ style.

   static std::string * read_words(std::istream &, unsigned cnt = 0);
   static bool valid_encoding(
     const std::string * const, const encoding &, unsigned);
   static void print_error(const std::string &);
   static bool assign_values(const std::string * const, encoding &, unsigned);
   static bool assign_character_value(
     encoding::code_type &, const std::string * const, encoding &, unsigned);
   static unsigned hash(const encoding &, const std::string &);
   static void check_word(const std::string &);
   static std::string massage_word(std::string);


<character-value procedures>

static unsigned
hash(const encoding & e, const std::string & word) {

  // Return the hash value of the given word under the given character
  // encoding.

  const unsigned s = word.size();
  return s + e[word[0]] + e[word[s - 1]];
  }


<main()>

<input procedures>

std::ostream & 
operator << (std::ostream & os, const encoding & e) {

  for (unsigned i = 0; i < encoding::char_cnt; i++)
    os << static_cast<unsigned int>(e[static_cast<int>(i)]) << "\n";

  return os;
  }


static void
print_error(const std::string & emsg) {

  // Print the given error message, with decorations, to std-err.

  std::cerr << "! " << emsg << ".\n";
  }

The Encoding

An instance of the encoding class represents a mapping from characters to values; the values have type encoding::code_type. An encoding instance behaves as an array indexed by either characters 'a' through 'z' or unsigned integers 0 through 25 via the overloaded operator []; The correspondence between char and unsigned indices is the obvious one: 'a' corresponds to 0, 'b' corresponds to 1, and so on.

<encoding.h>=

#ifndef _encoding_h_defined_
#define _encoding_h_defined_


#include <ostream>


class encoding {

  public:

    // The type of the value to which characters are mapped.

       typedef unsigned char code_type;


    // The number of characters in the encodings.

       static unsigned char_cnt;


    // Return a new encoding in which all characters are mapped to 0.

       encoding() {
         memset(codes, 0, sizeof(codes));
         }

    // Return a reference to the code value for the given character.

       code_type & 
       operator [] (char c) {
         assert(('a' ≤ c) ∧ (c ≤ 'z'));
         return codes[c - 'a'];
         }

    // Return an immutable reference to the code value for the given character.

       const code_type & 
       operator [] (char c) const {
         assert(('a' ≤ c) ∧ (c ≤ 'z'));
         return codes[c - 'a'];
         }


    // Return a reference to the code value for the given index.

       code_type & 
       operator [] (int i) {
         assert((0 ≤ i) ∧ (static_cast<unsigned>(i) < char_cnt));
         return codes[i];
         }


    // Return an immutable reference to the code value for the given index.

       const code_type & 
       operator [] (int i) const {
         assert((0 ≤ i) ∧ (static_cast<unsigned>(i) < char_cnt));
         return codes[i];
         }


    // Write the given character encoding to the given output stream, returning
    // the output stream.

       friend std::ostream & 
       operator << (std::ostream &, const encoding &);

  private:

    code_type codes[26];
  };

#endif

Utility Procedures

<public utility prototypes>= (U→)

extern void remove_duplicate(std::string *, const std::string &);
extern std::string * order_words(std::string *);

The input words are treated as a set, which means there should be no duplicates. As each word is input, compare it against the list of words already input. If a duplicate is found on the list, the duplicate is removed from the list by sliding all the words (and the empty word marker) to the right of the duplidate left by one array slot, compressing out the duplicate.

remove_duplicates() assumes the word list contains no duplicates and returns immediately after a duplicate for word is removed.

<utility procedures>= (U→) [D→]

void
remove_duplicate(std::string * words, const std::string & word) {
  while (not words→empty())
    if (*words == word) {
      do *words = *(words + 1);
      while (not (++words)→empty());
      break;
      }
    else
      words++;
  }

Defines remove_duplicate (links are to index).

Now we come to the whole point of the assignment: the sort used to speed-up the brute-force search for a non-colliding character encoding.

<utility procedures>+= (U→) [←D→]

std::string *
order_words(std::string * words) {
  return pruning_order(frequency_order(words));
  }

Defines order_words (links are to index).

Assuming that freq[i] contains the letter frequency for words[i], a simple selection sort will do to order words.

<sort in decreasing fequency order>= (U→)

for (i = 0; i < cnt; i++) {
  unsigned max = i;
  for (int j = i + 1; j < cnt; j++)
    if (freq[i] > freq[max])
      max = i;
  std::swap(freq[i], freq[max]);
  std::swap(words[i], words[max]);
  }

freq[i] is the sum of the frequencies of the first and last letters of words[i]. freq[] is easiest computed using the encoding f to hold the character frequencies; that is, f[c] is the number of times the character c appears as a first or last letter in words.

Note that using an encoding to represent character frequencies is bad programming, because it makes an assumption that encoding values can be big. For the encoding implementation used here, "big" is 255, which means that words contains at least 128 words, 127 of which starts and ends with the same character. Finding character values in such a case is pointless, but supposed six months from now the encoding implementation is optimized to save space by using four bits for each character value, exploiting the maximum value limit of 15. Now the character-value generator is broken for a list of sixteen words all of which start with the same character, which is a word list the generator should be able to handle.

<compute word frequencies>= (U→)

encoding f;
int cnt = 0;

for (i = 0; not words[i].empty(); i++) {
  assert(not words[i].empty());
  f[words[i][0]]++;
  f[words[i][words[i].size() - 1]]++;
  cnt++;
  }

unsigned * freq = new unsigned [cnt];
memset(freq, 0, sizeof(unsigned)*cnt);

for (i = 0; i < cnt; i++)
  freq[i] = f[words[i][0]] + f[words[i][words[i].size() - 1]];

And that, except for some small housekeeping, is all that needs to be done to sort words in decreasing feqeuency order.

<utility procedures>+= (U→) [←D→]

static std::string *
frequency_order(std::string * words) {
  int i;

  <compute word frequencies>
  <sort in decreasing fequency order>

  delete [] freq;

  return words;
  }

Defines frequency_order (links are to index).

The next sort is a bit tricker. The goal is to check a word for collisions as soon as its first and last letters have recevied values. The word list, being ordered by decreasing letter frequency, doesn't support that goal.

For example, suppose the word list starts with "egg" and "elf" and ends with "fog". The first three letters to get values are 'e', 'g', and 'f', at which point "fog" should be checked for collisions against "egg" and "elf". However, "fog" appears at the end of the list and won't be checked until last, which is not the goal.

To reach the goal, the word list has to be re-ordered so that words can be tested as soon as they have a hash value. The re-ordering can be done by a modified form of selection sort. One index, i, scans left to right through the word list, from high-frequency to low-frequency words. At each point of the scan, some set of letters - namely the first and last letters of the words in word-list[0..i] - have received non-colliding values.

<utility procedures>+= (U→) [←D]

static std::string *
pruning_order(std::string * words) {
  
  encoding e;
  int i = 0;

  while (not words[i].empty()) {
    const std::string & str = words[i];
    e[str[0]] = 1;
    e[str[str.size() - 1]] = 1;

    for (int j = i + 1; not words[j].empty(); j++) {
      const std::string & str = words[j];
      if (e[str[0]] and e[str[str.size() - 1]]) {
        std::string str = words[j];
        for (int k = j ; i + 1 < k; k--)
          words[k] = words[k - 1];
        words[++i] = str;
        }
      }

    i++;
    }      

  return words;
  }

Defines pruning_order (links are to index).

<utils.cc>=

#include "utils.h"
#include "encoding.h"


// Deja vu c++ style.

   static std::string * pruning_order(std::string *);
   static std::string * frequency_order(std::string *);


<utility procedures>


#ifdef TESTINGUTILS

// g++ -g -DTESTINGUTILS -o test-utils -ansi -pedantic -Wall utils.cc && ./test-utils

int main() {
  std::string words[] = { "hello", "world", "" };
  pruning_order(words);
  assert((words[0] == "hello") ∧ (words[1] == "world") and words[2].empty());

  std::string wds1[] = { "hello", "world", "olleh", "" };
  pruning_order(wds1);
  assert((wds1[0] == "hello") ∧ (wds1[1] == "olleh") and 
         (wds1[2] == "world") and wds1[3].empty());

  std::string wds2[] = { "world", "hello", "olleh", "" };
  pruning_order(wds2);
  assert((wds2[0] == "world") ∧ (wds2[1] == "hello") and 
         (wds2[2] == "olleh") and wds2[3].empty());

  std::string wds3[] = { "hello", "olleh", "world", "" };
  pruning_order(wds3);
  assert((wds3[0] == "hello") ∧ (wds3[1] == "olleh") and 
         (wds3[2] == "world") and wds3[3].empty());

  std::string wds4[] = { "red", "white", "blue", "" };
  remove_duplicate(wds4, "red");
  assert((wds4[0] == "white") ∧ (wds4[1] == "blue") ∧ (wds4[2] == ""));

  remove_duplicate(wds4, "blue");
  assert((wds4[0] == "white") ∧ (wds4[1] == ""));

  remove_duplicate(wds4, "blue");
  assert((wds4[0] == "white") ∧ (wds4[1] == ""));
  }

#endif

Defines utils.cc (links are to index).

<utils.h>=

#ifndef _utils_h_defined_
#define _utils_h_defined_

#include <string>

<public utility prototypes>

#endif

Defines utils.h (links are to index).

References

Minimal Perfect Hash Functions Made Simple by Richard Cichelli, Communications of the ACM, January 1980, pages 17-19.

Index

<encoding.h>: D1
<gen-cv.cc>: D1
<main()>: D1, U2
<utils.cc>: D1
<utils.h>: D1
<character-value procedures>: D1, D2, D3, U4
<compute word frequencies>: D1, U2
<handle command-line options>: U1, D2
<input procedures>: D1, D2, D3, U4
<public utility prototypes>: D1, U2
<read input>: U1, D2
<search for character values>: U1, D2
<sort in decreasing fequency order>: D1, U2
<utility procedures>: D1, D2, D3, D4, U5

assign_character_value: U1, D2, U3
assign_values: D1, U2, U3, U4
check_word: U1, D2, U3
frequency_order: U1, D2, U3
main: D1, U2
massage_word: U1, D2, U3
order_words: U1, U2, D3
pruning_order: U1, D2, U3
read_words: U1, D2, U3
remove_duplicate: U1, U2, D3, U4
utils.cc: D1
utils.h: U1, U2, D3
valid_encoding: U1, D2, U3

This page last modified on 8 March 2006.