// CS 305 - Computer Algorithms I
// Fall 2002.
//
// Example solution for Programming Assignment 3b.
#include <iostream>
#include <iomanip>
#include "mitm.h"
#include <vector>
typedef const char * ccp;
// Where the word information lives.
struct word_info {
unsigned cnt;
std::string word;
word_info * next;
word_info(const std::string & w, word_info * wip)
: cnt(1), word(w), next(wip) { }
word_info() : cnt(0), next(0) { }
};
// Deja vu, c++ style.
static bool is_tag_name(ccp, ccp, ccp);
static ccp skip_space(ccp, ccp);
static bool find_tag(ccp &, ccp &, ccp, ccp);
static bool outline_tag(ccp &, ccp &, ccp);
static word_info * sort_count(word_info *);
static std::string lowercasify(ccp, ccp);
static bool
body_ends(ccp & body_start, ccp & body_end, ccp end) {
// Look for the body in the range [body_start, body_end); if found, set
// body_start to the first byte, body_end to one past the last byte, and
// return true. Otherwise return false.
if (not find_tag(body_start, body_end, end, "body"))
return false;
body_start = body_end;
ccp bs = body_start;
if (not find_tag(bs, body_end, end, "/body"))
return false;
body_end = bs;
return true;
}
static void
clean_the_body(ccp start, ccp end) {
// Remove html cruft from the text in the range [start, end).
assert(start and end);
ccp
tag_start = start,
tag_end;
while (outline_tag(tag_start, tag_end, end)) {
memset(const_cast<char *>(tag_start), ' ', tag_end - tag_start);
tag_start = tag_end;
}
}
static bool
find_tag(ccp & tag_start, ccp & tag_end, ccp end, ccp tag_name) {
// Return true iff the exists a complete tag in the range [tag_start, end)
// with the given name. If a complete tag exists, set tag_start and tag_end
// to the start and one character past the end (respectively) of the
// left-most complete tag; otherwise, the values of tag_start and tag_end are
// undefined.
assert(tag_start and end and tag_name);
while (outline_tag(tag_start, tag_end, end)) {
if (is_tag_name(tag_start, tag_end, tag_name))
return true;
tag_start = tag_end;
}
return false;
}
static bool
find_word(ccp & word_start, ccp & word_end, ccp end) {
// Find the leftmost maximal sequence of alpahabetic characters in the range
// [word_start, end). If such a sequence exists, bracket it by the range
// [word_start, word_end) and return true; otherwise return false.
while ((word_start < end) and not isalpha(*word_start)) word_start++;
if (word_start >= end) return false;
word_end = word_start + 1;
while ((word_end < end) and isalpha(*word_end)) word_end++;
return true;
}
static word_info *
find_words(ccp start, ccp end) {
// Find all the words in the range [start, end) and store them in a linked
// list; return the head of the list.
word_info wi;
wi.next = 0;
ccp wend;
while (find_word(start, wend, end)) {
const std::string word = lowercasify(start, wend);
start = wend;
word_info * wip = &wi;
while (wip->next and (word > wip->next->word))
wip = wip->next;
if (not wip->next or (word < wip->next->word))
wip->next = new word_info(word, wip->next);
else
wip->next->cnt++;
}
return wi.next;
}
static bool
get_the_body(const resource & document, ccp & bstart, ccp & bend) {
//
bstart = document.data;
if (not body_ends(bstart, bend, document.data + document.size))
return false;
const size_t body_size = bend - bstart;
char * body = new char [body_size];
memcpy(body, bstart, body_size);
bstart = body;
bend = body + body_size;
clean_the_body(bstart, bend);
return true;
}
static bool
is_tag_name(ccp tag_start, ccp tag_end, ccp tag_name) {
// Return true iff the tag delimited by [tag_start, tag_end) has the given
// tag name.
assert(tag_start and tag_name);
if (*tag_start == '<') tag_start++;
tag_start = skip_space(tag_start, tag_end);
while ((tag_start < tag_end)
and (tolower(*tag_start) == tolower(*tag_name))
and *tag_name) {
tag_name++;
tag_start++;
}
return not *tag_name and ((tag_start >= tag_end) or not isalpha(*tag_start));
}
static std::string
lowercasify(ccp start, ccp end) {
// Return as a string the characters in the range [start, end), converting
// the characters to lowercase.
assert(start and end and (start <= end));
std::string str(start, end);
for(std::string::size_type i = 0; i < str.size(); i++)
str[i] = tolower(str[i]);
return str;
}
void
mitm(const std::string & uri, const resource & document) {
// Process the given resource from the given uri.
ccp body_start, body_end;
if (not get_the_body(document, body_start, body_end))
return;
word_info * words = sort_count(find_words(body_start, body_end));
for (word_info * wip = words; wip; wip = wip->next)
std::cout << std::setw(4) << wip->cnt << " " << wip->word << "\n";
while (words) {
word_info * next = words->next;
delete words;
words = next;
}
delete [] body_start;
}
static bool
outline_tag(ccp & tag_start, ccp & tag_end, ccp end) {
// Return true iff the exists a paired set of angle brackets < > in the range
// [tag_start, end). If a pair exists, set tag_start to point to the < and
// tag_end to point to one to the right of the >; otherwise, the values of
// tag_start and tag_end are undefined.
assert(tag_start <= end);
tag_start = reinterpret_cast<ccp>(memchr(tag_start, '<', end - tag_start));
if (not tag_start)
return false;
tag_end = reinterpret_cast<ccp>(memchr(tag_start, '>', end - tag_start));
if (tag_end) tag_end++;
return tag_end;
}
static ccp
skip_space(ccp start, ccp end) {
// Return the left-most non-space character in the range [start, end) or end
// if there's no such character.
assert(start);
while ((start < end) and isspace(*start)) start++;
return start;
}
static word_info *
sort_count(word_info * words) {
// Sort the given list of words in ascending order by word count. Return the
// head of the newly sorted list.
word_info wi;
wi.next = words;
for (word_info * i = &wi; i->next; i = i->next) {
word_info * before_min = i;
for (word_info * j = before_min->next; j->next; j = j->next)
if (before_min->next->cnt > j->next->cnt)
before_min = j;
word_info * const min = before_min->next;
before_min->next = min->next;
min->next = i->next;
i->next = min;
}
for (word_info * i = &wi; i->next and i->next->next; i = i->next)
assert(i->next->cnt <= i->next->next->cnt);
return wi.next;
}
#ifdef TESTING
// g++ -o testing-mitm -gstabs -DTESTING -ansi -pedantic mitm.cc && testing-mitm
int
main() {
ccp tag = "< a >";
assert(is_tag_name(tag, tag + strlen(tag), "a"));
tag = "<a>";
assert(is_tag_name(tag, tag + strlen(tag), "a"));
tag = "<alpha>";
assert(not is_tag_name(tag, tag + strlen(tag), "a"));
tag = "<body> </body>";
ccp tag_end;
assert(body_ends(tag, tag_end, tag + strlen(tag)));
assert(tag + 1 == tag_end);
tag = "<body></body>";
assert(body_ends(tag, tag_end, tag + strlen(tag)));
assert(tag == tag_end);
tag = "<body><body>";
assert(not body_ends(tag, tag_end, tag + strlen(tag)));
}
#endif
// $Log: mitm.cc,v $
// Revision 1.3 2003/10/27 16:59:28 rclayton
// Add a check to make sure the list is sorted.
//
// Revision 1.2 2003/10/25 03:19:32 rclayton
// Use a list of words and counts rather than a vector of them.
//
// Revision 1.1 2003/10/11 22:39:06 rclayton
// Initial revision
//
syntax highlighted by Code2HTML, v. 0.9