D
Dave Townsend
Hi,
Can anybody help me with the following piece of code?
The purpose behind the code is to parse HTML files, strip out
the tags and return the text between tags. This is part of a larger
application which will perform "searches" for text values in a
directory of html files, trying to match only the non-tagged text
in the documents.
My approach was to customize an istreambuf_iterator which would
iterate over the text characters in the HTML bodies and ignore the HTML
tags. I did this by deriving from the std istreambuf_iterator and
overriding the ++ operators to parse the underlying input, screen the data
and return the next valid character. The iteration seems to work OK on some
simple HTML test data - I've seen the tags stripped away and the text body
get through.
Now here's the clever part. I thought I could use the std algorithm
search directly: plug in the range of the stream and the range of the text
string to look for, and there you go. Unfortunately, this doesn't work! The
search always fails to find the search string.
Am I missing something about search? I thought the requirements for
search only required the iterators to be forward iterators - is there
something I've misunderstood here? I'd like to use this code; it strikes me
as a nice solution utilizing the power of the STL and the stream classes.
Many thanks for any help!
dave.
#include <algorithm>
#include <fstream>
#include <iostream>
#include <istream>
#include <iterator>
#include <string>
#include <vector>
using namespace std;
// Single-pass input iterator over the characters of a stream that skips HTML
// tags: everything from a '<' up to and including the next '>' is dropped, so
// dereferencing only ever yields the text between tags.
//
// Like std::istreambuf_iterator, every copy shares the one underlying stream
// position.  This is therefore an input iterator, NOT a forward iterator:
// multi-pass algorithms such as std::search cannot be run on it directly.
// Copy the filtered text into a container first, e.g.
// std::string(first, last), and search that instead.
class htmlstreambuf_iterator : public std::istreambuf_iterator<char>
{
    typedef std::istreambuf_iterator<char> base_type;

public:
    // Starts iterating over `stream`, positioned on its first text character
    // (or at end-of-stream when there is none).  Any std::istream is
    // accepted, so callers passing a std::ifstream keep working.
    htmlstreambuf_iterator(std::istream& stream)
        : base_type(stream), _intag(false), _snapshot(false), _value('\0')
    {
        settle();
    }

    // Builds the end-of-stream iterator.
    htmlstreambuf_iterator()
        : base_type(), _intag(false), _snapshot(false), _value('\0')
    {}

    // Copy construction and assignment are deliberately compiler-generated so
    // that the complete iterator state travels with every copy.

    // Returns the current text character.
    // Precondition: the iterator is not at end-of-stream.
    char operator*() const
    {
        return _snapshot ? _value : base_type::operator*();
    }

    // Returns a pointer to a copy of the current character.  The pointer
    // refers to storage inside this iterator and is only meaningful until the
    // iterator is next dereferenced, incremented or destroyed.
    // Precondition: the iterator is not at end-of-stream.
    const char* operator->() const
    {
        if (!_snapshot)
        {
            _value = base_type::operator*();
        }
        return &_value;
    }

    // Advances to the next text character, skipping any tags on the way.
    htmlstreambuf_iterator& operator++()
    {
        incr();
        return *this;
    }

    // Advances like the prefix form and returns an iterator that still
    // dereferences to the character that was current BEFORE the increment,
    // so `*it++` behaves as input-iterator algorithms expect.  The returned
    // object is only meant to be dereferenced: it shares the already-advanced
    // stream, so comparing or incrementing it reflects the new position.
    htmlstreambuf_iterator operator++(int)
    {
        htmlstreambuf_iterator previous(*this);
        if (!previous._snapshot && !at_end())
        {
            // Remember the character now; once the stream has advanced it
            // can no longer be read back through the shared buffer.
            previous._value = base_type::operator*();
            previous._snapshot = true;
        }
        incr();
        return previous;
    }

    // True when both iterators are at end-of-stream or neither is (the same
    // rule std::istreambuf_iterator::equal applies).
    bool equal(const htmlstreambuf_iterator& rhs) const
    {
        return base_type::equal(rhs);
    }

private:
    // True once the underlying stream buffer is exhausted.
    bool at_end() const
    {
        return base_type::equal(base_type());
    }

    // Starting at the current position, consumes tag characters until the
    // iterator rests on a text character or reaches end-of-stream.  The
    // end-of-stream check happens before every read, so the buffer is never
    // dereferenced past its end (e.g. for an unterminated tag).
    void settle()
    {
        while (!at_end())
        {
            const char c = base_type::operator*();
            if (_intag)
            {
                if (c == '>')
                {
                    _intag = false;
                }
            }
            else if (c == '<')
            {
                _intag = true;
            }
            else
            {
                return; // resting on ordinary text
            }
            base_type::operator++();
        }
    }

    // Moves one character forward and then skips any tag that follows.
    // Incrementing an end-of-stream iterator is a no-op.
    void incr()
    {
        _snapshot = false;
        if (at_end())
        {
            return;
        }
        base_type::operator++();
        settle();
    }

private:
    bool _intag;         // currently between '<' and the matching '>'
    bool _snapshot;      // _value holds a character saved by operator++(int)
    mutable char _value; // saved character / storage handed out by operator->
};
int main(int argc, char* argv[])
{
ifstream input( argv[1] );
htmlstreambuf_iterator iter(input);
htmlstreambuf_iterator iterend;
string searchtext("D");
htmlstreambuf_iterator pos = search( iter, iterend,
searchtext.begin(), searchtext.end() );
// pos always points to end of input. !!!
bool found = pos != iterend ;
return 0;
}
Can anybody help me with the following piece of code?
The purpose behind the code is to parse HTML files, strip out
the tags and return the text between tags. This is part of a larger
application which will perform "searches" for text values in a
directory of html files, trying to match only the non-tagged text
in the documents.
My approach was to customize an istreambuf_iterator which would
iterate over the text characters in the HTML bodies and ignore the HTML
tags. I did this by deriving from the std istreambuf_iterator and
overriding the ++ operators to parse the underlying input, screen the data
and return the next valid character. The iteration seems to work OK on some
simple HTML test data - I've seen the tags stripped away and the text body
get through.
Now here's the clever part. I thought I could use the std algorithm
search directly: plug in the range of the stream and the range of the text
string to look for, and there you go. Unfortunately, this doesn't work! The
search always fails to find the search string.
Am I missing something about search? I thought the requirements for
search only required the iterators to be forward iterators - is there
something I've misunderstood here? I'd like to use this code; it strikes me
as a nice solution utilizing the power of the STL and the stream classes.
Many thanks for any help!
dave.
#include <algorithm>
#include <fstream>
#include <istream>
#include <iostream>
#include <algorithm>
#include <vector>
using namespace std;
// Single-pass input iterator over the characters of a stream that skips HTML
// tags: everything from a '<' up to and including the next '>' is dropped, so
// dereferencing only ever yields the text between tags.
//
// Like std::istreambuf_iterator, every copy shares the one underlying stream
// position.  This is therefore an input iterator, NOT a forward iterator:
// multi-pass algorithms such as std::search cannot be run on it directly.
// Copy the filtered text into a container first, e.g.
// std::string(first, last), and search that instead.
class htmlstreambuf_iterator : public std::istreambuf_iterator<char>
{
    typedef std::istreambuf_iterator<char> base_type;

public:
    // Starts iterating over `stream`, positioned on its first text character
    // (or at end-of-stream when there is none).  Any std::istream is
    // accepted, so callers passing a std::ifstream keep working.
    htmlstreambuf_iterator(std::istream& stream)
        : base_type(stream), _intag(false), _snapshot(false), _value('\0')
    {
        settle();
    }

    // Builds the end-of-stream iterator.
    htmlstreambuf_iterator()
        : base_type(), _intag(false), _snapshot(false), _value('\0')
    {}

    // Copy construction and assignment are deliberately compiler-generated so
    // that the complete iterator state travels with every copy.

    // Returns the current text character.
    // Precondition: the iterator is not at end-of-stream.
    char operator*() const
    {
        return _snapshot ? _value : base_type::operator*();
    }

    // Returns a pointer to a copy of the current character.  The pointer
    // refers to storage inside this iterator and is only meaningful until the
    // iterator is next dereferenced, incremented or destroyed.
    // Precondition: the iterator is not at end-of-stream.
    const char* operator->() const
    {
        if (!_snapshot)
        {
            _value = base_type::operator*();
        }
        return &_value;
    }

    // Advances to the next text character, skipping any tags on the way.
    htmlstreambuf_iterator& operator++()
    {
        incr();
        return *this;
    }

    // Advances like the prefix form and returns an iterator that still
    // dereferences to the character that was current BEFORE the increment,
    // so `*it++` behaves as input-iterator algorithms expect.  The returned
    // object is only meant to be dereferenced: it shares the already-advanced
    // stream, so comparing or incrementing it reflects the new position.
    htmlstreambuf_iterator operator++(int)
    {
        htmlstreambuf_iterator previous(*this);
        if (!previous._snapshot && !at_end())
        {
            // Remember the character now; once the stream has advanced it
            // can no longer be read back through the shared buffer.
            previous._value = base_type::operator*();
            previous._snapshot = true;
        }
        incr();
        return previous;
    }

    // True when both iterators are at end-of-stream or neither is (the same
    // rule std::istreambuf_iterator::equal applies).
    bool equal(const htmlstreambuf_iterator& rhs) const
    {
        return base_type::equal(rhs);
    }

private:
    // True once the underlying stream buffer is exhausted.
    bool at_end() const
    {
        return base_type::equal(base_type());
    }

    // Starting at the current position, consumes tag characters until the
    // iterator rests on a text character or reaches end-of-stream.  The
    // end-of-stream check happens before every read, so the buffer is never
    // dereferenced past its end (e.g. for an unterminated tag).
    void settle()
    {
        while (!at_end())
        {
            const char c = base_type::operator*();
            if (_intag)
            {
                if (c == '>')
                {
                    _intag = false;
                }
            }
            else if (c == '<')
            {
                _intag = true;
            }
            else
            {
                return; // resting on ordinary text
            }
            base_type::operator++();
        }
    }

    // Moves one character forward and then skips any tag that follows.
    // Incrementing an end-of-stream iterator is a no-op.
    void incr()
    {
        _snapshot = false;
        if (at_end())
        {
            return;
        }
        base_type::operator++();
        settle();
    }

private:
    bool _intag;         // currently between '<' and the matching '>'
    bool _snapshot;      // _value holds a character saved by operator++(int)
    mutable char _value; // saved character / storage handed out by operator->
};
int main(int argc, char* argv[])
{
ifstream input( argv[1] );
htmlstreambuf_iterator iter(input);
htmlstreambuf_iterator iterend;
string searchtext("D");
htmlstreambuf_iterator pos = search( iter, iterend,
searchtext.begin(), searchtext.end() );
// pos always points to end of input. !!!
bool found = pos != iterend ;
return 0;
}