mitm.cc

// CS 305 - Computer Algorithms I
// Fall 2002.
//
// Example solution for Programming Assignment 3b.

#include <iostream>
#include <iomanip>
#include "mitm.h"
#include <vector>

typedef const char * ccp;


// Where the word information lives.

   struct word_info {

     unsigned cnt;
     std::string word;
     word_info * next;

     word_info(const std::string & w, word_info * wip) 
       : cnt(1), word(w), next(wip) { }

     word_info() : cnt(0), next(0) { }
     };


// Deja vu, c++ style.

   static bool is_tag_name(ccp, ccp, ccp);
   static ccp skip_space(ccp, ccp);
   static bool find_tag(ccp &, ccp &, ccp, ccp);
   static bool outline_tag(ccp &, ccp &, ccp);
   static word_info * sort_count(word_info *);

   static std::string lowercasify(ccp, ccp);


static bool
body_ends(ccp & body_start, ccp & body_end, ccp end) {

  // Look for the body in the range [body_start, body_end); if found, set
  // body_start to the first byte, body_end to one past the last byte, and
  // return true.  Otherwise return false.

  if (not find_tag(body_start, body_end, end, "body"))
    return false;

  body_start = body_end;
  ccp bs = body_start;
  if (not find_tag(bs, body_end, end, "/body"))
    return false;

  body_end = bs;

  return true;
  }


static void
clean_the_body(ccp start, ccp end) {

  // Remove html cruft from the text in the range [start, end).

  assert(start and end);

  ccp 
    tag_start = start,
    tag_end;

  while (outline_tag(tag_start, tag_end, end)) {
    memset(const_cast<char *>(tag_start), ' ', tag_end - tag_start);
    tag_start = tag_end;
    }
  }


static bool
find_tag(ccp & tag_start, ccp & tag_end, ccp end, ccp tag_name) {

  // Return true iff the exists a complete tag in the range [tag_start, end)
  // with the given name.  If a complete tag exists, set tag_start and tag_end
  // to the start and one character past the end (respectively) of the
  // left-most complete tag; otherwise, the values of tag_start and tag_end are
  // undefined.

  assert(tag_start and end and tag_name);

  while (outline_tag(tag_start, tag_end, end)) {
    if (is_tag_name(tag_start, tag_end, tag_name))
      return true;
    tag_start = tag_end;
    }

  return false;
  }


static bool
find_word(ccp & word_start, ccp & word_end, ccp end) {

  // Find the leftmost maximal sequence of alpahabetic characters in the range
  // [word_start, end).  If such a sequence exists, bracket it by the range
  // [word_start, word_end) and return true; otherwise return false.

  while ((word_start < end) and not isalpha(*word_start)) word_start++;
  if (word_start >= end) return false;

  word_end = word_start + 1;
  while ((word_end < end) and isalpha(*word_end)) word_end++;

  return true;
  }


static word_info *
find_words(ccp start, ccp end) {

  // Find all the words in the range [start, end) and store them in a linked
  // list; return the head of the list.

  word_info wi;
  wi.next = 0;

  ccp wend;

  while (find_word(start, wend, end)) {
    const std::string word = lowercasify(start, wend);
    start = wend;

    word_info * wip = &wi;

    while (wip->next and (word > wip->next->word))
      wip = wip->next;

    if (not wip->next or (word < wip->next->word))
      wip->next = new word_info(word, wip->next);
    else
      wip->next->cnt++;
    }

  return wi.next;
  }


static bool
get_the_body(const resource & document, ccp & bstart, ccp & bend) {

  // 

  bstart = document.data;

  if (not body_ends(bstart, bend, document.data + document.size))
    return false;

  const size_t body_size = bend - bstart;
  char * body = new char [body_size];
  memcpy(body, bstart, body_size);

  bstart = body;
  bend = body + body_size;

  clean_the_body(bstart, bend);

  return true;
  }


static bool 
is_tag_name(ccp tag_start, ccp tag_end, ccp tag_name) {
  
  // Return true iff the tag delimited by [tag_start, tag_end) has the given
  // tag name.

  assert(tag_start and tag_name);

  if (*tag_start == '<') tag_start++;
  tag_start = skip_space(tag_start, tag_end);
  while ((tag_start < tag_end)
	 and (tolower(*tag_start) == tolower(*tag_name))
	 and *tag_name) {
    tag_name++;
    tag_start++;
    }

  return not *tag_name and ((tag_start >= tag_end) or not isalpha(*tag_start));
  }


static std::string
lowercasify(ccp start, ccp end) {

  // Return as a string the characters in the range [start, end), converting
  // the characters to lowercase.

  assert(start and end and (start <= end));

  std::string str(start, end);
  for(std::string::size_type i = 0; i < str.size(); i++)
    str[i] = tolower(str[i]);

  return str;
  }


void 
mitm(const std::string & uri, const resource & document) {

  // Process the given resource from the given uri.

  ccp body_start, body_end;

  if (not get_the_body(document, body_start, body_end))
    return;

  word_info * words = sort_count(find_words(body_start, body_end));

  for (word_info * wip = words; wip; wip = wip->next)
    std::cout << std::setw(4) << wip->cnt << " " << wip->word << "\n";

  while (words) {
    word_info * next = words->next;
    delete words;
    words = next;
    }

  delete [] body_start;
  }


static bool
outline_tag(ccp & tag_start, ccp & tag_end, ccp end) {

  // Return true iff the exists a paired set of angle brackets < > in the range
  // [tag_start, end).  If a pair exists, set tag_start to point to the < and
  // tag_end to point to one to the right of the >; otherwise, the values of
  // tag_start and tag_end are undefined.

  assert(tag_start <= end);

  tag_start = reinterpret_cast<ccp>(memchr(tag_start, '<', end - tag_start));
  if (not tag_start)
    return false;

  tag_end = reinterpret_cast<ccp>(memchr(tag_start, '>', end - tag_start));
  if (tag_end) tag_end++;

  return tag_end;
  }


static ccp
skip_space(ccp start, ccp end) {
  
  // Return the left-most non-space character in the range [start, end) or end
  // if there's no such character.

  assert(start);

  while ((start < end) and isspace(*start)) start++;

  return start;
  }


static word_info *
sort_count(word_info * words) {

  // Sort the given list of words in ascending order by word count.  Return the
  // head of the newly sorted list.


  word_info wi;
  wi.next = words;

  for (word_info * i = &wi; i->next; i = i->next) {
    word_info * before_min = i;
    for (word_info * j = before_min->next; j->next; j = j->next)
      if (before_min->next->cnt > j->next->cnt)
	before_min = j;

    word_info * const min = before_min->next;
    before_min->next = min->next;
    min->next = i->next;
    i->next = min;
    }

  for (word_info * i = &wi; i->next and i->next->next; i = i->next)
    assert(i->next->cnt <= i->next->next->cnt);

  return wi.next;
  }


#ifdef TESTING

// g++ -o testing-mitm -gstabs -DTESTING -ansi -pedantic mitm.cc && testing-mitm

int
main() {
  ccp tag = "< a  >";
  assert(is_tag_name(tag, tag + strlen(tag), "a"));
  tag = "<a>";
  assert(is_tag_name(tag, tag + strlen(tag), "a"));
  tag = "<alpha>";
  assert(not is_tag_name(tag, tag + strlen(tag), "a"));

  tag = "<body> </body>";
  ccp tag_end;
  assert(body_ends(tag, tag_end, tag + strlen(tag)));
  assert(tag + 1 == tag_end);

  tag = "<body></body>";
  assert(body_ends(tag, tag_end, tag + strlen(tag)));
  assert(tag == tag_end);

  tag = "<body><body>";
  assert(not body_ends(tag, tag_end, tag + strlen(tag)));
  }

#endif

// $Log: mitm.cc,v $
// Revision 1.3  2003/10/27 16:59:28  rclayton
// Add a check to make sure the list is sorted.
//
// Revision 1.2  2003/10/25 03:19:32  rclayton
// Use a list of words and counts rather than a vector of them.
//
// Revision 1.1  2003/10/11 22:39:06  rclayton
// Initial revision
//
syntax highlighted by Code2HTML, v. 0.9