This problem and the solution presented here is based on PORTPREP: A Portable Repeated String Finder by L. Jones, E. Gassie, and S. Radhakrishnan, Software - Practice and Experience, January, 1989 (Vol. 19, No. 1), Pages 64-77.
<main.cc
>=
#include <cstdlib>
#include "tuple.h"
int main(int argc, char * argv[]) {
natural min, max;
<process command-line arguments>
init_tuples(max);
string word;
while (cin >> word)
insert_tuples(word);
print_tuples(cout, min, max);
term_tuples();
}
Definesmain
(links are to index).
There may be zero, one, or two command-line arguments.
<process command-line arguments>= (<-U) switch (argc) { case 1: min = max = 1; break; case 2: min = max = atoi(argv[1]); break; case 3: min = atoi(argv[1]); max = atoi(argv[2]); if (min > max) { const natural t = min; min = max; max = t; } break; default: cerr << "Command format is \"" << argv[0] << " [min] [max]\".\n"; return EXIT_FAILURE; }
<tuple.h
>=
#ifndef _tuple_h_defined_
#define _tuple_h_defined_
#include <string>
#include <fstream>
typedef unsigned int natural;
extern void init_tuples(natural);
extern void insert_tuples(const string &);
extern void print_tuples(ostream &, natural, natural);
extern void term_tuples(void);
#endif
<tuple.cc
>= #include <cassert> #include <vector> #include <cctype> #include "tuple.h" <procedure to lower-caseify a string> <tuple.cc
data> <tuple.cc
code>
Each node stores a word. The word itself is stored in word
, while a
lower-case version of the word is stored in key
to simplify comparisons.
The nodes are arranged in an n-ary tree using the sibling-child pointer representation. This representation has the advantage of needing only two pointers per node, but yet is capable of representing a parent with an arbitrary number of children. The cost of this representation is a slight increase in complexity when traversing the tree.
The root of the tree holds the empty word ""
. A node n
at level
i > 0 represents the last word of an i-tuple; the whole tuple can
be found by traversing the tree back to the root. The value n.count
is the
number of times the i-tuple has appeared so far.
<tuple.cc
data>= (U->) [D->]
struct node {
const string key;
const string word;
natural count;
node * children;
node * siblings;
node(const string w)
: key(lower_case(w)), word(w), count(0), children(0), siblings(0)
{ }
};
Definesnode
(links are to index).
As promised, the root of the tree holding the tuples represents the empty string.
<tuple.cc
data>+= (U->) [<-D->]
static struct node tuple_tree("");
Definestuple_tree
(links are to index).
The node pointer in
level[i]
points to the last word of the current
i
-tuple in the tree; level[0]
points to the root of the tree.
level
contains max_tuple_size
elements.
<tuple.cc
data>+= (U->) [<-D->]
static natural max_tuple_size;
static vector<node *> level;
Defineslevel
,max_tuple_size
(links are to index).
junk
is a little hack to give elements in level
something to point to
until max_tuple_size
words have been read, at which point all elements in
level
point into tuple_tree
. junk
is not used thereafter.
<tuple.cc
data>+= (U->) [<-D]
static struct node junk("");
Definesjunk
(links are to index).
Initialize the tuple counter to count upto max
-tuples; max
must be
positive.
<tuple.cc
code>= (U->) [D->]
void init_tuples(natural max) {
assert(max > 0);
max_tuple_size = max;
for (natural i = 0; i < max_tuple_size; i++)
level.push_back(&junk);
level[0] = &tuple_tree;
}
Definesinit_tuples
(links are to index).
Given nptr
, a pointer to a node, and the word w
, return a pointer to
the child of *nptr
that has w
as its key. If *nptr
doesn't have
such a child, make one and return a pointer to the new child.
<tuple.cc
code>+= (U->) [<-D->]
static node * find_child(node * nptr, const string & w) {
node * np;
string key = lower_case(w);
for (np = nptr->children; np != 0; np = np->siblings)
if (np->key == key)
return np;
np = new node(w);
np->siblings = nptr->children;
nptr->children = np;
return np;
}
Definesfind_child
(links are to index).
Append the word w
to the current set of tuples, increasing the count of
each new tuple by one. After the new tuples have been counted, adjust the
tuple set by tossing out the largest tuple and adding the smallest tuple.
<tuple.cc
code>+= (U->) [<-D->]
void insert_tuples(const string & word) {
natural i;
for (i = 0; i < max_tuple_size; i++) {
level[i] = find_child(level[i], word);
level[i]->count++;
}
for (i = max_tuple_size - 1; i > 0; i--)
level[i] = level[i - 1];
level[0] = &tuple_tree;
}
Definesinsert_tuples
(links are to index).
<tuple.cc
code>+= (U->) [<-D->]
static void print(
ostream &, const string & , const node *, natural, natural, natural);
static void print_children(
ostream & outs, const string & s, const node * parent_p,
natural min, natural max, natural current) {
for (const node * np = parent_p->children; np; np = np->siblings)
print(outs, s, np, min, max, current + 1);
}
Definesprint_children
(links are to index).
Print to the output stream outs
the count and tuples for all min
- to
max
-tuples collected so far.
<tuple.cc
code>+= (U->) [<-D->]
static void print(
ostream & outs, const string & tuple, const node * const nptr,
natural min, natural max, natural current) {
if (current <= max) {
const string new_tuple = tuple + " " + nptr->word;
if (min <= current)
outs << nptr->count << " " << new_tuple << "\n";
print_children(outs, new_tuple, nptr, min, max, current);
}
}
Print to the output stream outs
the count and tuples for all min
- to
max
-tuples collected so far.
<tuple.cc
code>+= (U->) [<-D->]
void print_tuples(ostream & outs, natural min, natural max) {
print_children(outs, "", &tuple_tree, min, max, 0);
}
Definesprint_tuples
(links are to index).
Free the node *np
, after deleting *np
's children and siblings. This
procedure eat stack space as if it were candy, but it's simple to write and
understand.
<tuple.cc
code>+= (U->) [<-D->]
static void free(node * np) {
if (np) {
free(np->children);
free(np->siblings);
delete np;
}
}
Definesfree
(links are to index).
Free all the nodes allocated to count the tuples.
<tuple.cc
code>+= (U->) [<-D]
void term_tuples(void) {
free(tuple_tree.children);
free(tuple_tree.siblings);
free(junk.children);
free(junk.siblings);
}
Definesterm_tuples
(links are to index).
Return a copy of s
in which all letters are lower case.
<procedure to lower-caseify a string>= (U->) static string lower_case(string s) { for (int i = s.size() - 1; i >= 0; i--) s[i] = tolower(s[i]); return s; }
Defineslower_case
(links are to index).
main.cc
>: D1
tuple.cc
>: D1
tuple.cc
code>: U1, D2, D3, D4, D5, D6, D7, D8, D9
tuple.cc
data>: U1, D2, D3, D4, D5
tuple.h
>: D1
This page last modified on 27 October 2001.