<url forward definitions>= static bool find_anchor_tag(csp &, csp &, csp);
Used below; next definition.
<url forward definitions>+= static csp extract_href_value(csp, csp);
Used below; previous definition.
<print-url procedures>= void print_urls(std::ostream & os, const resource & document) { // Write to os, one per line, all the href values for the anchor tags found // in the given document. unsigned fcnt = 0; const std::string * fragments = fragment(document.data, document.data + document.size, fcnt); const std::string * tag_start = fragments, * tag_end; while (find_anchor_tag(tag_start, tag_end, fragments + fcnt)) { csp href = extract_href_value(tag_start, tag_end); if (href) { assert((*href).size() > 1); assert((*href)[0] == '"'); os << (*href).substr(1, (*href).size() - 2) << "\n"; } tag_start = tag_end + 1; } delete [] fragments; }
Definesprint_urls
(links are to index).
<print-url procedures>+= static bool find_anchor_tag(csp & tag_start, csp & tag_end, csp end) { // Return true iff the exists a complete anchor tag in the range [tag_start, // end). If a complete anchor tag does exist, set tag_start and tag_end to // the start and end (respectively) of the left-most complete anchor tag; // otherwise, the values of tag_start and tag_end are undefined. while (true) { while ((tag_start < end) and (*tag_start != "<")) tag_start++; if (tag_start >= end) return false; tag_end = tag_start + 1; if ((tag_end < end) and same_string(*tag_end, "a")) break; tag_start = tag_end; } while ((++tag_end < end) and (*tag_end != ">")) { } return tag_end < end; }
Definesfind_anchor_tag
(links are to index).Used below; previous and next definitions.
href
followed =
; the next piece
must be a string.
<print-url procedures>+= static csp extract_href_value(csp tag_start, csp tag_end) { // Return a pointer to the value of the href attribute found in the tag // delimited by [tag_start, tag_end). Return 0 if there's no such value. csp href = tag_start; const std::string href_str = "href"; while ((href < tag_end) and not same_string(*href, href_str)) href++; if (href >= tag_end) return 0; if ((++href >= tag_end) or (*href != "=")) return 0; if ((++href >= tag_end) or ((*href).at(0) != '"')) return 0; return href; }
Definesextract_href_value
(links are to index).Used below; previous and next definitions.
<
and >
, the words a
and href
, and
strings. Before we go off in search of anchor tag HREF url values, it makes
sense to clear the underbrush by removing from the document all the stuff we're
not interested in.
While we're at it, we might as well restructure the document so it's easier to search. Rather than constantly looking for the start and end of document pieces, it would be better to find the pieces all at once at the beginning, so the rest of the code can deal with something at a higher level than a character sequence. Strings are a good place to store the document pieces of interest, and storing the strings in an array lets us step through the document by indexing, which is handy.
Ah, but there's a problem: how big is the array? Or equivalently, how many pieces are in the document? If we could store the pieces in an array, we could count them. But before we can make the array we have to count the pieces so we know how big the array is.
Fortunately, our old friend recursion comes to help us out of this quandary. It's easy to write a procedure that can count either zero or one pieces in a document; it's just an if statement. If the procedure doesn't find any pieces in the document, then we're done. If the document finds a piece, then the procedure can recursively call itself on the remaining part of the document following the piece. If we keep count along the way, then we'll know, after the last piece is found, how many pieces there were in total.
The fragment()
code below is a bit more liberal than the description given
above. It fragments a document into three kinds of pieces: maximal letter
sequences, double-quote delimited strings, and single characters for whatever's
left. This is wasteful and stupid, but it gets the job done (and will be fixed
up in the next assignment).
<print-url procedures>+= static std::string * fragment(ccp start, ccp end, unsigned & fcnt) { // Fragment the character array given by the range [start, end); // return the fragments as an array of strings; the number of strings // are stored in fcnt. while ((start < end) and isspace(*start)) start++; if (start >= end) return new std::string [fcnt]; const ccp copy = start; if ('"' == *start) { while ((++start < end) and ('"' != *start)) { } if (start >= end) return new std::string [fcnt]; start++; } else if (isalpha(*start)) while ((++start < end) and isalpha(*start)) { } else start++; const unsigned fno = fcnt++; std::string * fragments = fragment(start, end, fcnt); fragments[fno] = std::string(copy, start); return fragments; }
Definesfragment
(links are to index).Used below; previous definition.
<urls.cc
>=
#include <string>
#include <ostream>
#include <cctype>
#include <iostream>
#include "urls.h"
typedef const char * ccp;
typedef const std::string * csp;
// Deja vu c++ style
<url forward definitions>
static bool same_string(const std::string &, const std::string &);
static std::string * fragment(ccp, ccp, unsigned &);
<print-url procedures>
static bool
same_string(const std::string & s1, const std::string & s2) {
// Return true iff s1 is equal to s2 when case is ignored.
if (s1.size() != s2.size())
return false;
for (unsigned i = 0; i < s2.size(); i++)
if (tolower(s1[i]) != tolower(s2[i]))
return false;
return true;
}
#ifdef TESTING
// g++ -o urls-testing -gstabs -ansi -pedantic -DTESTING urls.cc && urls-testing
#include <cassert>
#include <sstream>
int main() {
const char * cp = "";
unsigned fcnt = 0;
csp fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(fcnt == 0);
cp = "hello";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(fcnt == 1);
assert(fragments[0] == cp);
delete [] fragments;
cp = "hello world";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(fcnt == 2);
assert(fragments[0] == "hello");
assert(fragments[1] == "world");
delete [] fragments;
cp = " \t\nhello world\t \n \t";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(fcnt == 2);
assert(fragments[0] == "hello");
assert(fragments[1] == "world");
delete [] fragments;
cp = " \t\n\"hello world!\"\t \n \t";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(fcnt == 1);
assert(fragments[0] == "\"hello world!\"");
delete [] fragments;
cp = " \t\nhello world!\t \n \t";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(fcnt == 3);
assert(fragments[0] == "hello");
assert(fragments[1] == "world");
assert(fragments[2] == "!");
delete [] fragments;
cp = " < a href = hello > ";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(6 == fcnt);
csp tag_end, tag_start = fragments;
assert(find_anchor_tag(tag_start, tag_end, fragments + 6));
assert(tag_start == fragments);
assert(tag_end == fragments + 5);
delete [] fragments;
cp = "<ahref=hello>";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
assert(5 == fcnt);
tag_start = fragments;
assert(not find_anchor_tag(tag_start, tag_end, fragments + 5));
delete [] fragments;
cp = " < a href = \"hello\" > ";
fcnt = 0;
fragments = fragment(cp, cp + strlen(cp), fcnt);
csp href = extract_href_value(fragments, fragments + 6);
assert(fragments + 4 == href);
delete [] fragments;
# define url "\"www.monmouth.edu\""
cp = "< a href = " url " >";
resource document = { strlen(cp), const_cast<char *>(cp) };
std::ostringstream oss;
print_urls(oss, document);
assert(oss.str() == "www.monmouth.edu\n");
cp = "<html> < a href = " url " >";
document.size = strlen(cp);
document.data = const_cast<char *>(cp);
oss.str("");
print_urls(oss, document);
assert(oss.str() == "www.monmouth.edu\n");
}
#endif
Definesurls.cc
(links are to index).This code is written to a file (or else not used).
<urls.h
>=
#ifndef _urls_h_defined_
#define _urls_h_defined_
#include "get.h"
// Look for anchor tags in the given resource and print the href values
// to the given output stream.
extern void print_urls(std::ostream &, const resource &);
#endif
Definesurls.h
(links are to index).This code is written to a file (or else not used).
<main.cc
>=
#include <cstdlib>
#include <iostream>
#include "get.h"
#include "urls.h"
int
main(int argc, char * argv[]) {
if (argc != 2) {
std::cerr << "Command format is \"" << argv[0] << " url\".\n";
exit(EXIT_FAILURE);
}
int return_value;
resource document;
const std::string emsg = get(argv[1], document);
if (not emsg.empty()) {
std::cerr << "Error during url get, " << emsg << ".\n";
return_value = EXIT_FAILURE;
}
else {
print_urls(std::cout, document);
return value = EXIT_SUCCESS;
}
delete [] document.data;
return return_value;
}
Definesmain.cc
(links are to index).This code is written to a file (or else not used).
main.cc
>: D1
urls.cc
>: D1
urls.h
>: D1
This page last modified on 3 December 2003.