I suspect the handler is being called multiple times, each time with a
different part of the original text. You can test this hypothesis by
putting a debug statement in here.
You seem to be correct - I have simplified the code and placed a
simple html file (below) in d:\fred and the result appears in
d:\fred\jim and indeed the <p> ... </p>text is there twice. Any ideas
why?
Thanks
Geoff
package MyParser;
use base qw(HTML:
arser);
use strict;
use diagnostics;
my ($in_heading,$in_p,$fh);
sub register_fh {
$fh = $_[1];
}
sub reset { ($in_heading,$in_p)=(0,0)}
sub start {
my ( $self, $tagname, $attr, undef, $origtext ) = @_;
if ( $tagname eq 'h2' ) {
$in_heading = 1;
return;
}
if ( $tagname eq 'p' ) {
$in_p = 1;
return;
}
}
sub end {
my ( $self, $tagname, $origtext ) = @_;
if ( $tagname eq 'h2' ) {
$in_heading = 0;
return;
}
if ( $tagname eq 'p' ) {
$in_p = 0;
return;
}
}
sub text {
my ( $self, $origtext ) = @_;
print $fh "<h2>$origtext</h2> \n" if $in_heading;
print $fh "<p>$origtext</p> \n" if $in_p;
}
package main;
use File::Find;
my $dir = "d:/fred";
my $parser = MyParser->new;
find sub {
return if -d $_;
my $name = $_;
open( OUT, ">>d:/fred/jim/$name" )
|| die "can't open d:/fred/jim/$name: $!";
print OUT ("<html><head><title>test</title>
</head><body> \n");
$parser->register_fh(\*OUT);
$parser->parse_file($_);
$parser->reset;
print OUT ("</body></html> \n");
}, $dir;
--------------- html file ---------------------------
<html>
<head>
<title>test</title>
</head>
<body>
<h2>test file</h2>
<p>The is some text which I am using to test whether para.pl using
HTML:
arser will output all of the text in this paragraph in one
paragraph, or, in two smaller paragraphs.</p>
</body>
</html>