<main()
>= (U→)
int
main(int argc, char * argv []) {
<handle command-line options>
std::string * words;
<read input>
<search for character values>
delete [] words;
}
Definesmain
(links are to index).
Command-line Options
The only command-line option is -s
, which determines whether or not the
words are sorted before the character assignments are done (s
for "slow").
The default is to do the sorting. This option is not part of the assignment;
it was added to illustrate the effects of sorting.
<handle command-line options>= (←U) bool do_sorting = true; if ((2 == argc) ∧ (0 == strcmp(argv[1], "-s"))) do_sorting = false; else if (argc > 1) { print_error(std::string("Command format is \"") + argv[0] + " [-s]\""); exit(EXIT_FAILURE); }
Input
Because error handling is simple, just throw an execption if anything goes
wrong during input and print the exception before exiting.
<read input>= (←U) try { words = read_words(std::cin); } catch (std::string err) { print_error(err); exit(EXIT_FAILURE); }
Read a word from the given input stream; return the list of all words read in
the order read and ending with the empty string. cnt
is the number of
words that have been read already. Throw a string exception if anything
untoward happens.
This is an example of how you can use recursion to read an arbitrary number of words from input and assign them to an array. The array is created after all the words have been read; in the mean time, each word is stored in an activation record on the run-time stack.
<input procedures>= (U→) [D→] static std::string * read_words(std::istream & is, unsigned cnt) { std::string word; std::string * words; if (is >> word) { check_word(word); words = read_words(is, cnt + 1); word = massage_word(word); remove_duplicate(words + cnt + 1, word); words[cnt] = word; } else if (is.eof()) { words = new std::string[cnt + 1]; words[cnt] = ""; } else throw std::string("error during word read"); return words; }
Definesread_words
(links are to index).
Check the given word for validity; throw a string exception if the word is invalid. To be valid a word should consist only of alphabetic characters.
<input procedures>+= (U→) [←D→] static void check_word(const std::string & word) { for (unsigned i = 0; i < word.size(); i++) if (not isalpha(word[i])) throw std::string("A word contains a non-alphabetic character"); }
Definescheck_word
(links are to index).
Return the given word in lower case.
<input procedures>+= (U→) [←D] static std::string massage_word(std::string word) { for (unsigned i = 0; i < word.size(); i++) { const char & c = word[i]; assert(isalpha(c)); word[i] = tolower(c); } return word; }
Definesmassage_word
(links are to index).
Searching for Character Values
Assign and check encodings for the characters associated with the word at the
given index in the given word list.
<character-value procedures>= (U→) [D→] static bool assign_values( const std::string * const words, encoding & e, unsigned cnt = 0) { const std::string & str = words[cnt]; if (str.empty()) return true; if (0 == e[str[0]]) return assign_character_value(e[str[0]], words, e, cnt); if (0 == e[str[str.size() - 1]]) return assign_character_value(e[str[str.size() - 1]], words, e, cnt); if (valid_encoding(words, e, cnt)) return assign_values(words, e, cnt + 1); return false; }
Definesassign_values
(links are to index).
<character-value procedures>+= (U→) [←D→] static bool assign_character_value( encoding::code_type & ec, const std::string * const words, encoding & e, unsigned cnt) { for (encoding::code_type i = 1; i < max_code_value; i++) { ec = i; if (assign_values(words, e, cnt)) return true; } ec = 0; return false; }
Definesassign_character_value
(links are to index).
Return true iff the given word list contains no hash-value collisions under the given character encodings.
<character-value procedures>+= (U→) [←D] static bool valid_encoding( const std::string * const words, const encoding & e, unsigned cnt) { for (unsigned i = 0; i < cnt; i++) { const unsigned h = hash(e, words[i]); for (unsigned j = i + 1; not words[j].empty(); j++) if (h == hash(e, words[j])) return false; } return true; }
Definesvalid_encoding
(links are to index).
<search for character values>= (←U) encoding e; if (do_sorting) words = order_words(words); if (assign_values(words, e)) std::cout << e; else print_error("No character encoding possible");
<gen-cv.cc
>= #include <string> #include <iostream> #include <cstdlib> #include <cctype> #include <cstring> #include "encoding.h" #include "utils.h" // The numbers of characters in the encoding. unsigned encoding::char_cnt = 26; // The largest possible code value. static encoding::code_type max_code_value = 15; // Deja vu c++ style. static std::string * read_words(std::istream &, unsigned cnt = 0); static bool valid_encoding( const std::string * const, const encoding &, unsigned); static void print_error(const std::string &); static bool assign_values(const std::string * const, encoding &, unsigned); static bool assign_character_value( encoding::code_type &, const std::string * const, encoding &, unsigned); static unsigned hash(const encoding &, const std::string &); static void check_word(const std::string &); static std::string massage_word(std::string); <character-value procedures> static unsigned hash(const encoding & e, const std::string & word) { // Return the hash value of the given word under the given character // encoding. const unsigned s = word.size(); return s + e[word[0]] + e[word[s - 1]]; } <main()
> <input procedures> std::ostream & operator << (std::ostream & os, const encoding & e) { for (unsigned i = 0; i < encoding::char_cnt; i++) os << static_cast<unsigned int>(e[static_cast<int>(i)]) << "\n"; return os; } static void print_error(const std::string & emsg) { // Print the given error message, with decorations, to std-err. std::cerr << "! " << emsg << ".\n"; }
encoding::code_type
. An encoding instance
behaves as an array indexed by either characters 'a' through 'z' or unsigned
integers 0 through 25 via the overloaded operator []
; The correspondence
between char
and unsigned
indices is the obvious one: 'a' corresponds
to 0, 'b' corresponds to 1, and so on.
<encoding.h
>=
#ifndef _encoding_h_defined_
#define _encoding_h_defined_
#include <ostream>
class encoding {
public:
// The type of the value to which characters are mapped.
typedef unsigned char code_type;
// The number of characters in the encodings.
static unsigned char_cnt;
// Return a new encoding in which all characters are mapped to 0.
encoding() {
memset(codes, 0, sizeof(codes));
}
// Return a reference to the code value for the given character.
code_type &
operator [] (char c) {
assert(('a' ≤ c) ∧ (c ≤ 'z'));
return codes[c - 'a'];
}
// Return an immutable reference to the code value for the given character.
const code_type &
operator [] (char c) const {
assert(('a' ≤ c) ∧ (c ≤ 'z'));
return codes[c - 'a'];
}
// Return a reference to the code value for the given index.
code_type &
operator [] (int i) {
assert((0 ≤ i) ∧ (static_cast<unsigned>(i) < char_cnt));
return codes[i];
}
// Return an immutable reference to the code value for the given index.
const code_type &
operator [] (int i) const {
assert((0 ≤ i) ∧ (static_cast<unsigned>(i) < char_cnt));
return codes[i];
}
// Write the given character encoding to the given output stream, returning
// the output stream.
friend std::ostream &
operator << (std::ostream &, const encoding &);
private:
code_type codes[26];
};
#endif
<public utility prototypes>= (U→) extern void remove_duplicate(std::string *, const std::string &); extern std::string * order_words(std::string *);
The input words are treated as a set, which means there should be no duplicates. As each word is input, compare it against the list of words already input. If a duplicate is found on the list, the duplicate is removed from the list by sliding all the words (and the empty word marker) to the right of the duplidate left by one array slot, compressing out the duplicate.
remove_duplicates()
assumes the word list contains no duplicates
and returns immediately after a duplicate for word
is removed.
<utility procedures>= (U→) [D→] void remove_duplicate(std::string * words, const std::string & word) { while (not words→empty()) if (*words == word) { do *words = *(words + 1); while (not (++words)→empty()); break; } else words++; }
Definesremove_duplicate
(links are to index).
Now we come to the whole point of the assignment: the sort used to speed-up the brute-force search for a non-colliding character encoding.
<utility procedures>+= (U→) [←D→] std::string * order_words(std::string * words) { return pruning_order(frequency_order(words)); }
Definesorder_words
(links are to index).
Assuming that freq[i]
contains the letter frequency for words[i]
, a
simple selection sort will do to order words
.
<sort in decreasing fequency order>= (U→) for (i = 0; i < cnt; i++) { unsigned max = i; for (int j = i + 1; j < cnt; j++) if (freq[i] > freq[max]) max = i; std::swap(freq[i], freq[max]); std::swap(words[i], words[max]); }
freq[i]
is the sum of the frequencies of the first and last letters of
words[i]
. freq[]
is easiest computed using the encoding f
to hold
the character frequencies; that is, f[c]
is the number of times the
character c
appears as a first or last letter in words
.
Note that using an encoding to represent character frequencies is bad
programming, because it makes an assumption that encoding values can be big.
For the encoding implementation used here, "big" is 255, which means that
words
contains at least 128 words, 127 of which starts and ends with the
same character. Finding character values in such a case is pointless, but
supposed six months from now the encoding implementation is optimized to save
space by using four bits for each character value, exploiting the maximum value
limit of 15. Now the character-value generator is broken for a list of sixteen
words all of which start with the same character, which is a word list the
generator should be able to handle.
<compute word frequencies>= (U→) encoding f; int cnt = 0; for (i = 0; not words[i].empty(); i++) { assert(not words[i].empty()); f[words[i][0]]++; f[words[i][words[i].size() - 1]]++; cnt++; } unsigned * freq = new unsigned [cnt]; memset(freq, 0, sizeof(unsigned)*cnt); for (i = 0; i < cnt; i++) freq[i] = f[words[i][0]] + f[words[i][words[i].size() - 1]];
And that, except for some small housekeeping, is all that needs to be done to sort words in decreasing feqeuency order.
<utility procedures>+= (U→) [←D→] static std::string * frequency_order(std::string * words) { int i; <compute word frequencies> <sort in decreasing fequency order> delete [] freq; return words; }
Definesfrequency_order
(links are to index).
The next sort is a bit tricker. The goal is to check a word for collisions as soon as its first and last letters have recevied values. The word list, being ordered by decreasing letter frequency, doesn't support that goal.
For example, suppose the word list starts with "egg" and "elf" and ends with "fog". The first three letters to get values are 'e', 'g', and 'f', at which point "fog" should be checked for collisions against "egg" and "elf". However, "fog" appears at the end of the list and won't be checked until last, which is not the goal.
To reach the goal, the word list has to be re-ordered so that words can be
tested as soon as they have a hash value. The re-ordering can be done by a
modified form of selection sort. One index, i
, scans left to right through
the word list, from high-frequency to low-frequency words. At each point of
the scan, some set of letters - namely the first and last letters of the words
in word-list[0..i]
- have received non-colliding values.
<utility procedures>+= (U→) [←D] static std::string * pruning_order(std::string * words) { encoding e; int i = 0; while (not words[i].empty()) { const std::string & str = words[i]; e[str[0]] = 1; e[str[str.size() - 1]] = 1; for (int j = i + 1; not words[j].empty(); j++) { const std::string & str = words[j]; if (e[str[0]] and e[str[str.size() - 1]]) { std::string str = words[j]; for (int k = j ; i + 1 < k; k--) words[k] = words[k - 1]; words[++i] = str; } } i++; } return words; }
Definespruning_order
(links are to index).
<utils.cc
>=
#include "utils.h"
#include "encoding.h"
// Deja vu c++ style.
static std::string * pruning_order(std::string *);
static std::string * frequency_order(std::string *);
<utility procedures>
#ifdef TESTINGUTILS
// g++ -g -DTESTINGUTILS -o test-utils -ansi -pedantic -Wall utils.cc && ./test-utils
int main() {
std::string words[] = { "hello", "world", "" };
pruning_order(words);
assert((words[0] == "hello") ∧ (words[1] == "world") and words[2].empty());
std::string wds1[] = { "hello", "world", "olleh", "" };
pruning_order(wds1);
assert((wds1[0] == "hello") ∧ (wds1[1] == "olleh") and
(wds1[2] == "world") and wds1[3].empty());
std::string wds2[] = { "world", "hello", "olleh", "" };
pruning_order(wds2);
assert((wds2[0] == "world") ∧ (wds2[1] == "hello") and
(wds2[2] == "olleh") and wds2[3].empty());
std::string wds3[] = { "hello", "olleh", "world", "" };
pruning_order(wds3);
assert((wds3[0] == "hello") ∧ (wds3[1] == "olleh") and
(wds3[2] == "world") and wds3[3].empty());
std::string wds4[] = { "red", "white", "blue", "" };
remove_duplicate(wds4, "red");
assert((wds4[0] == "white") ∧ (wds4[1] == "blue") ∧ (wds4[2] == ""));
remove_duplicate(wds4, "blue");
assert((wds4[0] == "white") ∧ (wds4[1] == ""));
remove_duplicate(wds4, "blue");
assert((wds4[0] == "white") ∧ (wds4[1] == ""));
}
#endif
Definesutils.cc
(links are to index).
<utils.h
>=
#ifndef _utils_h_defined_
#define _utils_h_defined_
#include <string>
<public utility prototypes>
#endif
Definesutils.h
(links are to index).
References
Minimal Perfect Hash Functions Made Simple by Richard
Cichelli, Communications of the ACM, January 1980, pages 17-19.
encoding.h
>: D1
gen-cv.cc
>: D1
main()
>: D1, U2
utils.cc
>: D1
utils.h
>: D1