slurp not working? ideas please!

R

Richard Morse

Geoff Cox said:
Anno

the code as of now follows - I am confused re how the OOP fits in with
the File::Find ... but as I am using html files in the 1 folder have
removed the File::Find part .. but still get the warning re
uninitialized value in pattern match for the

if ($next3 =~ /\$i\<(\d+);/) {

in sub classroomnotes

any ideas why?

Try doing the following:
package MyParser;
use base qw(HTML::parser);
[snip]
sub classroomnotes {

my ($pattern) = @_;

# print ("\$pattern has value $pattern \n");

open (INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php") ||
die "cannot open d:/a-keep9/short-nondb/allphp/allphp2.php \n";

my $line = <INNN>;

# since I don't see a chomp/chop anywhere, I'm assuming
# that each line has a newline at the end already.
print "line: ", $line;
while (<INNN>){
last if /$pattern/;
}
my ($curr, $next1, $next2, $next3) = <INNN>;

print "curr: ", $curr;
print "next1: ", $next1;
print "next2: ", $next2;
print "next3: ", $next3;
print "\n";
close (INNN);

if ($next3 =~ /\$i\<(\d+);/) {
my $nn = $1;
print OUT ("<td valign='top'> \n");
for ($c=1;$c<$nn;$c++) {
print OUT ('<a href="'. $pattern . "-doc" . $c . ".zip" . '">' .
"Document$c" . "</a><br>" . "\n");
}
print OUT ("</td></tr>\n");
}
}

What does this tell you?

Ricky
 
T

Tassilo v. Parseval

Also sprach Richard Morse:
I'm not sure exactly how you would get the directory name here (I've not
had the pleasure of using File::Find yet), but wouldn't it be better to
do something like:

find sub {
return if -d (some_function_to_get_cwd() . $_);
...
}

? This would handle any other miscellaneous directories that appear...

Yes, a -d check would be more generic and not skip dotfiles (for example).
As the OP wants to parse a certain type of file, he might also just
allow only files with a certain extension as in

find sub {
return unless /\.html$/;
...
}

Btw:

some_function_to_get_cwd() . $_

becomes

$File::Find::name

in the File::Find lingo.

Tassilo
 
G

Geoff Cox

I'm not sure exactly how you would get the directory name here (I've not
had the pleasure of using File::Find yet), but wouldn't it be better to
do something like:

find sub {
return if -d (some_function_to_get_cwd() . $_);
...
}

Richard,

Thanks for the idea - will give it a try.

Cheers

Geoff
 
G

Geoff Cox

Which order above? The order above is <h2>, <p> and finally <option>.

Even if I add a second or third set of data, the order remains intact
for me.

I still cannot reproduce this. :)

Tassilo,

Just to say - I had to go away overnight so will now try and come up
with something you can test!

Cheers

Geoff
 
G

Geoff Cox

Even if I add a second or third set of data, the order remains intact
for me.

I still cannot reproduce this. :)

Tassilo

If I use the code below on the html file called test.htm (contents
below) which is in the d:\a-keep9\short-nondb\oldshort3 folder amd
db.txt (contents below) which is the d:\a-keep9\short-nondb\db
I get

<h2>test first</h2>
<p> ashjk hjk etc first </p>

<h2>test second</h2>
<p> ahsj hk jk etcsecond </p>

followed by the 2 option sections

instead of

<h2>test first</h2>
<p> hdjsadh jkashd jk etc first</p>
<option etc

<h2>test fsecond</h2>
<p> jak jahd jaksd j ksecond </p>
<option etc

Any use?

Cheers

Geoff

------------- test.htm ------------

<html>
<head><title>test</title>
</head>
<body>

<h2> test first </h2>

<p> ahsjdhj akdh hasdj jakhd k ada
dkasdjl akld kajd kl as
aksjd lajksd aldsj first</p>

<option
value="docs/gcse/student-activities/business-location">Business
Location</option>

<h2> test second</h2>

<p> ahdjk ahsdjh ahd
akjdk akld asjd la
adj ajsdk lk second </p>

<option
value="docs/gcse/student-activities/business-location">Business
Location</option>

</body>
</html>

------------------------------ db.txt ----------------------

INSERT INTO total VALUES
('docs/gcse/student-activities/finance','<h3>GCSE Student Activities -
Finance</h3>1. methods of raising finance<br>2. don\'t call them
money<br>3. coloured breakeven point<br>4. The mug activity<br>5.
Cashflow and inflow<br>6. external influences on breakeven<br>7.
balance sheet<br>8. which account will you get me from');
INSERT INTO total VALUES
('docs/gcse/student-activities/marketing','<h3>GCSE Student Activities
- Marketing</h3>1. adverts-activity<br>. demand-and-supply<br>3.
market-research<br>4. market-segments<br>5. product-life-cycle<br>6.
promotions-and-business-objectives<br>7. supply-and-demand-2<br>8.
the-marketing-mix-and-types-of-economy<br>9. the-marketing-mix<br>10.
types-of-market-and-demand-and-supply<br>11.
types-of-ownership-and-the-marketing-mix');

-------------------------------------- the code -------------

package MyParser;
use base qw(HTML::parser);
use strict;
use diagnostics;

my ($in_heading,$in_p, $fh);

sub register_fh {
# $_[0] contains the parser object
$fh = $_[1];
}

sub reset { ($in_heading,$in_p)=(0,0)}

sub start {

my ( $self, $tagname, $attr, undef, $origtext ) = @_;

if ( $tagname eq 'h2' ) {
$in_heading = 1;
return;
}

if ( $tagname eq 'p' ) {
$in_p = 1;
return;
}

if ( $tagname eq 'option' ) {

print ("\$origtext has value $origtext \n");

main::choice( $attr->{ value } );

}

}

sub end {
my ( $self, $tagname, $origtext ) = @_;
if ( $tagname eq 'h2' ) {
$in_heading = 0;
return;
}

if ( $tagname eq 'p' ) {
$in_p = 0;
return;
}
}

sub text {
my ( $self, $origtext ) = @_;
print $fh "<h2>$origtext</h2> \n" if $in_heading;
print $fh "<p>$origtext</p> \n" if $in_p;

}

package main;

use File::Find;

my $dir = "d:/a-keep9/short-nondb/oldshort3";
my $parser = MyParser->new;

find sub {
return if /^\.\.?/; # catches "." and ".."
my $name = $_;
open( OUT, ">>d:/a-keep9/short-nondb/short/members2/$name" )
|| die "can't open d:/a-keep9/short-nondb/short/members2/$name:
$!";

print OUT ("<html><head><title>test</title>
<link rel=\"STYLESHEET\" type=\"text/css\"
href=\"assets/style/hala-1.css\">
</head><body> \n");
print OUT ("<table width='100%' border='1'> \n");

# undef $/;

$parser->register_fh(\*OUT);
$parser->parse_file($_);
$parser->reset;

print OUT ("</body></html> \n");

}, $dir;

sub choice {
my ($path) = @_;
if ( $path =~ /docs\/btec-first/ ) {
intro($path);
btecfirst($path);
}
elsif ( $path =~ /docs\/aslevel\/classroom-notes/ ) {
intro($path);
aslevelclassroomnotes($path);
}
elsif ( $path =~ /docs\/avce\/assignments/ ) {
intro($path);
avceassignments($path);
}
elsif ( $path =~
/docs\/aslevel\/simulations\/second-severn-bridge/ ) {
intro($path);
aslevelsimulationssevern($path);
}
elsif ( $path =~ /docs\/aslevel\/debates\/wind-farm-debate/ ) {
intro($path);
asleveldebateswindfarm($path);
}
elsif ( $path =~ /docs\/economics\/section1/ ) {
intro($path);
economicssection1($path);
}
elsif ( $path =~ /docs\/economics\/section2/ ) {
intro($path);
economicssection2($path);
}
elsif ( $path =~ /docs\/economics\/section3/ ) {
intro($path);
economicssection3($path);
}
elsif ( $path =~ /docs\/gcse\/classroom-notes/ ) {
intro($path);
gcseclassroomnotes($path);
}
elsif ( $path =~
/docs\/gcse\/student-activities\/games\/ice-lolly/ ) {
intro($path);
gcsegamesicelolly($path);
}
elsif ( ( $path =~ /docs\/gnvq-int\/assignments/ )
&& ( $path !~
/gnvq-int\/assignments\/gnvq-int-write-assignment/ ) )
{
intro($path);
gnvqintassignments($path);
}
elsif ( $path =~ /docs\/vgcse\/course-units/ ) {
intro($path);
vgcsecourseunits($path);
}
elsif ( $path =~
/docs\/gcse\/student-activities\/business-location/ ) {
intro($path);
gcsestudentactivitiesbusinesslocation($path);
}
elsif ( $path =~

/docs\/gcse\/student-activities\/business-structure-decisions/ )
{
intro($path);
gcsestudentactivitiesbusinessstructuredecisions($path);
}
elsif ( $path =~ /docs\/gcse\/student-activities\/finance/ ) {
intro($path);
gcsestudentactivitiesfinance($path);
}
elsif ( $path =~ /docs\/gcse\/student-activities\/marketing/ ) {
intro($path);
gcsestudentactivitiesmarketing($path);
}
elsif ( $path =~ /docs\/gcse\/student-activities\/people-at-work/
) {
intro($path);
gcsestudentactivitiespeopleatwork($path);
}
elsif ( $path =~ /docs\/gcse\/student-activities\/production/ ) {
intro($path);
gcsestudentactivitiesproduction($path);
}
else {
intro($path);
other($path);
}

}

sub intro {

my ($pathhere) = @_;
open( INN, "d:/a-keep9/short-nondb/db/db.txt" );
my $lineintro;

while ( defined( $lineintro = <INN> ) ) {
if ( $lineintro =~ /$pathhere','(.*?)'\)\;/ ) {
print OUT ("<tr><td>$1 <p> </td>\n");
}
}
}

sub btecfirst {

my ($pattern) = @_;
my $linee = $pattern;
my $c = 0;
$linee =~ /.*unit(\d).*?chap(\d)/;
my $u = $1;
my $chap = $2;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d);/ ) {
my $nn = $1;
print OUT ("<td valign='top'>\n");
for ( my $c = 1 ; $c < $nn ; $c++ ) {
print OUT (
'<a href="' . $pattern . "/unit" . $u . "-chap" .
$chap . "-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub aslevelclassroomnotes {

my ($pattern) = @_;
my $c;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

my $line = <INNN>;

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern . "-doc" . $c .
".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub other {

my ($pattern) = @_;
print OUT ("<td valign='top'> \n");
print OUT ( '<a href="' . $pattern . ".zip" . '">'
. "Document"
. "</a><br>"
. "\n" );
print OUT ("</td></tr>\n");

}

sub avceassignments {

my ($pattern) = @_;
print OUT ("<td valign='top'> \n");
print OUT ( '<a href="' . $pattern . ".zip" . '">'
. "Document"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern . "-grid" . ".zip" . '">' .
"Grid"
. "</a><br>"
. "\n" );
print OUT ("</td></tr>\n");

}

sub aslevelsimulationssevern {

my ($pattern) = @_;
print OUT ("<td valign='top'> \n");
print OUT ( '<a href="' . $pattern . ".zip" . '">'
. "Teacher's pack"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern . "-sp" . ".zip" . '">'
. "Student's Pack"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern . "-swb" . ".zip" . '">'
. "Student's Workbook"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern . "-dbp" . ".zip" . '">'
. "Debriefing Pack"
. "</a><br>"
. "\n" );

print OUT ("</td></tr>\n");

}

sub asleveldebateswindfarm {

my ($pattern) = @_;
print OUT ("<td valign='top'> \n");
print OUT ( '<a href="' . $pattern . ".zip" . '">'
. "Teacher's pack"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-a-company" . ".zip" . '">'
. "Adviser's Pack - Company"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-a-council" . ".zip" . '">'
. "Adviser's Pack - Council"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-s-company" . ".zip" . '">'
. "Student's Pack - Company"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-s-council" . ".zip" . '">'
. "Student's Pack - Council"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-s-workbook" . ".zip" . '">'
. "Student's Workbook"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-arbitrator" . ".zip" . '">'
. "Arbitrator's Pack"
. "</a><br>"
. "\n" );

print OUT ("</td></tr>\n");

}

sub economicssection1 {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/section1-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub economicssection2 {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/section2-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub economicssection3 {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/section3-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcseclassroomnotes {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern . "-doc" . $c .
".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsegamesicelolly {

my ($pattern) = @_;
print OUT ("<td valign='top'> \n");
print OUT ( '<a href="' . $pattern . ".zip" . '">'
. "Teacher's Pack"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-runners" . ".zip" . '">'
. "Runner's Pack"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern
. "-adjudicators" . ".zip" . '">'
. "Adjudicator's Pack"
. "</a><br>"
. "\n" );

print OUT ("</td></tr>\n");

}

sub gnvqintassignments {

my ($pattern) = @_;
print OUT ("<td valign='top'> \n");
print OUT ( '<a href="' . $pattern . ".zip" . '">'
. "Document"
. "</a><br>"
. "\n" );
print OUT ( '<a href="' . $pattern . "-grid" . ".zip" . '">' .
"Grid"
. "</a><br>"
. "\n" );
print OUT ("</td></tr>\n");

}

sub vgcsecourseunits {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern . "-chap" . $c .
".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub para {
my ($name) = @_;
my $line;
open( INPARA, "d:/a-keep9/short-nondb/progs/para2/$name" );

undef $/;
$line = <INPARA>;

print OUT ("<tr><td colspan='2'> \n");
print OUT $line;
print OUT ("</td></tr> \n");

$/ = "\n";
}

sub getpara {
local $/ = "\0d\0a";

my ($name) = @_;
my $line;

open( GETPARA, "d:/a-keep9/short-nondb/old-short/$name" );
open( OUTPARA, ">>d:/a-keep9/short-nondb/progs/para2/$name" );

while ( defined( $line = <GETPARA> ) ) {
if ( $line =~ /<p>(.*?)<\/p>/s ) {
print OUTPARA ("$1 \n");
}
}

#close(GETPARA);
#close(OUTPARA);
}

sub gcsestudentactivitiesbusinesslocation {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/location-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsestudentactivitiesbusinessstructuredecisions {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/bsd-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsestudentactivitiesfinance {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/finance-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsestudentactivitiesmarketing {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/marketing-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsestudentactivitiespeopleatwork {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/people-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsestudentactivitiesproduction {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/production-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}
 
T

Tassilo v. Parseval

Also sprach Geoff Cox:
Tassilo

If I use the code below on the html file called test.htm (contents
below) which is in the d:\a-keep9\short-nondb\oldshort3 folder amd
db.txt (contents below) which is the d:\a-keep9\short-nondb\db
I get

<h2>test first</h2>
<p> ashjk hjk etc first </p>

<h2>test second</h2>
<p> ahsj hk jk etcsecond </p>

followed by the 2 option sections

instead of

<h2>test first</h2>
<p> hdjsadh jkashd jk etc first</p>
<option etc

<h2>test fsecond</h2>
<p> jak jahd jaksd j ksecond </p>
<option etc

Any use?

Hardly. Your script does not print anything option related. How should
it? Please show me the part of your code which is supposed to produce

<option ...>

You'll notice that nothing of that kind exists.

Other than that, you posted far too much code. I usually can't be
bothered to create several directories and files just to realize that
the posted script (which includes many functions that are never called
for the sample data) expects a different directory layout and hence wont
work without manual intervention.

Tassilo
 
G

Geoff Cox

Hardly. Your script does not print anything option related. How should
it? Please show me the part of your code which is supposed to produce

Tassilo,

From the following, in sub start

if ( $tagname eq 'option' ) {

main::choice( $attr->{ value } );

}

the flow goes to say, the

sub choice

elsif ( $path =~ /docs\/gcse\/student-activities\/finance/ ) {
intro($path);
gcsestudentactivitiesfinance($path);
}

then to

sub gcsestudentactivitiesfinance {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/finance-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

Which makes clear that I would have to let you have a sample part of
yet another file, allphp2.php !!

Not sure how I could give you anything smaller than the total set of
files in order to test for yourself .. ??

Cheers

Geoff
 
T

Tassilo v. Parseval

Also sprach Geoff Cox:
Tassilo,

From the following, in sub start

if ( $tagname eq 'option' ) {

main::choice( $attr->{ value } );

}

the flow goes to say, the

sub choice

elsif ( $path =~ /docs\/gcse\/student-activities\/finance/ ) {
intro($path);
gcsestudentactivitiesfinance($path);
}

then to

sub gcsestudentactivitiesfinance {

my ($pattern) = @_;

my $c = 0;

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/finance-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

Which makes clear that I would have to let you have a sample part of
yet another file, allphp2.php !!

Well, I did follow the control flow of your program up to this point.
But not even in intro() or gcsestudentactivitiesfinance() "<option..."
shows up.

I grepped your source code for 'option' and I got exactly one match,
which is in MyParser::start().

Thus, I simply cannot see how your program gives the output that you
described. <option ...> doesn't come out in the wrong order. It doesn't
come out at all. I don't think that the existance of allphp2.php would
change anything about that.
Not sure how I could give you anything smaller than the total set of
files in order to test for yourself .. ??

For one, it would help to remove the File::Find dependecy and post code
that only works on one input file. That way, someone could put the input
file and the two other files opened by your program into one directory.
The HTML input could even be appended to the program behind the
'__END__' marker so make it more self-contained. Your program could then
do a

$parser->parse_file(\*DATA);

Changing absolute paths to relative ones is also a good idea (in your
program and your accordingly in your data).

Furthermore, functions not called for the sample data should be thrown
out. Likewise branches in main::choice() that your program wont enter.

Tassilo
 
T

Tad McClellan

Geoff Cox said:
sub gcsestudentactivitiesfinance {


Quit joking around, the first of the month was a few weeks ago.

Thatisjusttoohardtoread.

(That_is_just_too_hard_to_read.)


Use the underscores Luke!

open( INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php" );


<broken-record>
You should always, yes *always*, check the return value from open().
</broken-record>
 
G

Geoff Cox

I grepped your source code for 'option' and I got exactly one match,
which is in MyParser::start().

Thus, I simply cannot see how your program gives the output that you
described. <option ...> doesn't come out in the wrong order. It doesn't
come out at all. I don't think that the existance of allphp2.php would
change anything about that.

Tassilo,

OK - my fault - I should have said that the output (using the value
for option from the allphp2.php file) would be, for example,

<a
href="docs/gcse/student-activities/business-location/location-doc1.zip">Document1</a><br>

but I was using the <option etc as shorthand to demonstrate that the
order of output is wrong ...

I will try to act on your other suggestions!

Cheers

Geoff
 
G

Geoff Cox

For one, it would help to remove the File::Find dependecy and post code
that only works on one input file. That way, someone could put the input
file and the two other files opened by your program into one directory.

Tassilo,

OK - have tried to follow your suggestions in cutting down the code
etc.

The code and the 3 files test.htm, db.txt and allphp2.php can be in
the same folder. The results will appear in results.htm. Hope this can
allow you to see why the oder in results.htm is not the same as that
in test.htm ...

Cheers

Geoff

---------------tass2.pl ---------------------

package MyParser;
use base qw(HTML::parser);
use strict;
use diagnostics;

my ($in_heading,$in_p, $fh);

sub register_fh {

$fh = $_[1];
}

sub reset { ($in_heading,$in_p)=(0,0)}

sub start {

my ( $self, $tagname, $attr, undef, $origtext ) = @_;

if ( $tagname eq 'h2' ) {
$in_heading = 1;
return;
}

if ( $tagname eq 'p' ) {
$in_p = 1;
return;
}

if ( $tagname eq 'option' ) {

main::choice( $attr->{ value } );

}

}

sub end {
my ( $self, $tagname, $origtext ) = @_;
if ( $tagname eq 'h2' ) {
$in_heading = 0;
return;
}

if ( $tagname eq 'p' ) {
$in_p = 0;
return;
}
}

sub text {
my ( $self, $origtext ) = @_;
print $fh "<h2>$origtext</h2> \n" if $in_heading;
print $fh "<p>$origtext</p> \n" if $in_p;

}

package main;

my $parser = MyParser->new;

my $name = "test.htm";
open( OUT, ">>results.htm" )
|| die "$name: $!";

print OUT ("<html><head><title>test</title>
</head><body> \n");
print OUT ("<table width='100%' border='1'> \n");

$parser->register_fh(\*OUT);
$parser->parse_file($name);
$parser->reset;

print OUT ("</body></html> \n");

sub choice {
my ($path) = @_;

if ( $path =~ /docs\/gcse\/student-activities\/finance/ ) {
intro($path);
gcsestudentactivitiesfinance($path);
} elsif ( $path =~ /docs\/gcse\/student-activities\/marketing/ ) {
intro($path);
gcsestudentactivitiesmarketing($path);
}

}

sub intro {

my ($pathhere) = @_;
open( INN, "db.txt" );
my $lineintro;

while ( defined( $lineintro = <INN> ) ) {
if ( $lineintro =~ /$pathhere','(.*?)'\)\;/ ) {
print OUT ("<tr><td>$1 <p> </td>\n");
}
}
}


sub gcsestudentactivitiesfinance {

my ($pattern) = @_;

my $c = 0;

open( INNN, "allphp2.php" )
|| die "can't open allphp2.php: $! \n";

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/finance-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}

sub gcsestudentactivitiesmarketing {

my ($pattern) = @_;

my $c = 0;

open( INNN, "allphp2.php" )
|| die "can't open allphp2.php: $! \n";

while (<INNN>) {
last if /$pattern/;
}
my ( $curr, $next1, $next2, $next3 ) = <INNN>;
close(INNN);

if ( $next3 =~ /\$i\<(\d+);/ ) {

my $nn = $1;

print OUT ("<td valign='top'> \n");
for ( $c = 1 ; $c < $nn ; $c++ ) {
print OUT ( '<a href="' . $pattern
. "/marketing-doc"
. $c . ".zip" . '">'
. "Document$c"
. "</a><br>"
. "\n" );
}
print OUT ("</td></tr>\n");
}
}


-------------- test.htm ----------------------

<html>
<head><title>test</title>
</head>
<body>

<h2>Finance </h2>

<p> description re Finance document</p>

<option
value="docs/gcse/student-activities/finance">Marketing</option>

<h2>Marketing</h2>

<p> description re marketing document </p>

<option
value="docs/gcse/student-activities/marketing">Marketing</option>

</body>
</html>


-------------- db.txt -------------------------

INSERT INTO total VALUES
('docs/gcse/student-activities/finance','<h3>GCSE Student Activities -
Finance</h3>1. methods of raising finance<br>2. don\'t call them
money<br>3. coloured breakeven point<br>4. The mug activity<br>5.
Cashflow and inflow<br>6. external influences on breakeven<br>7.
balance sheet<br>8. which account will you get me from');
INSERT INTO total VALUES
('docs/gcse/student-activities/marketing','<h3>GCSE Student Activities
- Marketing</h3>1. adverts-activity<br>. demand-and-supply<br>3.
market-research<br>4. market-segments<br>5. product-life-cycle<br>6.
promotions-and-business-objectives<br>7. supply-and-demand-2<br>8.
the-marketing-mix-and-types-of-economy<br>9. the-marketing-mix<br>10.
types-of-market-and-demand-and-supply<br>11.
types-of-ownership-and-the-marketing-mix');




-------------- allphp2.pl --------------------

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>
<head>
<title>Untitled</title>

</head>

<body>
<?php
@require(dirname(__FILE__) . '/../../../config_php/config.php');

if (@$_POST['submit'] == 1)
{
if ($_POST['term'] == "") {
echo "<font color=#000000>No document chosen please select one</a> <p>
\n";
//echo "<a href='avce-assignments.php'><font color=#000000>Search
again</font></a> \n";

} else {
$query = "select * from total where " . $_POST['searchfield'] . "=" .
"'". $_POST['term'] . "'";
/* connect to the server */
$link = mysql_connect(
$conf['sql']['host'],
$conf['sql']['user'],
$conf['sql']['pass']
) or die ("cannot make connection to {$conf['sql']['db']}");
/* select database */
if (mysql_select_db($conf['sql']['db'],$link)) {
/* make query */
$result = mysql_query($query, $link);
if ($result) {
if (mysql_num_rows($result) == "0") {
echo "<font color=#000000>Not found - try again with a different
term</a> <p>\n";
// echo "<a href='assignments.htm'><font color=#000000>Search
again</font></a> \n";
} else {

// echo "<B>Your query found ".mysql_num_rows($result)." hit(s)</B>";

echo "<TABLE width='100%' BORDER='1' width='100%' cellpadding='10'
align='center'>\n";
while ($row[] = mysql_fetch_array($result)) {
printf("<td valign='top'>%s &nbsp;</td>",$row[0]["intro"]);

}

switch ($_POST['term']) {

case $_POST['term'] == "docs/gcse/student-activities/finance":
{
echo "<td valign='top' width='10%'>";
echo "<b>Select Document</b><br>";
for ($i=1;$i<9;$i++) {
echo "<a href='". $_POST['term'] . "/finance-doc" . $i .".zip" .
"'>$i</a>";
echo "<br>";
}
echo "</td></tr>";
echo "</TABLE>\n";
}

break;


case $_POST['term'] == "docs/gcse/student-activities/marketing":
{
echo "<td valign='top' width='10%'>";
echo "<b>Select Document</b><br>";
for ($i=1;$i<12;$i++) {
echo "<a href='". $_POST['term'] . "/marketing-doc" . $i .".zip" .
"'>$i</a>";
echo "<br>";
}
echo "</td></tr>";
echo "</TABLE>\n";
}

break;




}


}
}
}
}

}
?>


</body>
</html>
 
J

Joe Smith

Richard said:
find sub {
return if -d (some_function_to_get_cwd() . $_);
...
}

Unless you explictly invoke no_chdir=>1 in the options to find(),
the current working directory is the one that contains the file.
Therefore you can use simply
return if -d $_;
to check if "$File::Find::dir/$_" is a subdirectory.
 
G

Geoff Cox

Thanks for that Richard.

Cheers

Geoff

Unless you explictly invoke no_chdir=>1 in the options to find(),
the current working directory is the one that contains the file.
Therefore you can use simply
return if -d $_;
to check if "$File::Find::dir/$_" is a subdirectory.
 
G

Geoff Cox

Try doing the following:

Ricky,

sorry I missed your post ... Tassilo has put me straight re the OOP
part (see his posts here).

Cheers

Geoff


package MyParser;
use base qw(HTML::parser);
[snip]
sub classroomnotes {

my ($pattern) = @_;

# print ("\$pattern has value $pattern \n");

open (INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php") ||
die "cannot open d:/a-keep9/short-nondb/allphp/allphp2.php \n";

my $line = <INNN>;

# since I don't see a chomp/chop anywhere, I'm assuming
# that each line has a newline at the end already.
print "line: ", $line;
while (<INNN>){
last if /$pattern/;
}
my ($curr, $next1, $next2, $next3) = <INNN>;

print "curr: ", $curr;
print "next1: ", $next1;
print "next2: ", $next2;
print "next3: ", $next3;
print "\n";
close (INNN);

if ($next3 =~ /\$i\<(\d+);/) {
my $nn = $1;
print OUT ("<td valign='top'> \n");
for ($c=1;$c<$nn;$c++) {
print OUT ('<a href="'. $pattern . "-doc" . $c . ".zip" . '">' .
"Document$c" . "</a><br>" . "\n");
}
print OUT ("</td></tr>\n");
}
}

What does this tell you?

Ricky
 
K

ko

Geoff said:
Tassilo,

OK - have tried to follow your suggestions in cutting down the code
etc.

The code and the 3 files test.htm, db.txt and allphp2.php can be in
the same folder. The results will appear in results.htm. Hope this can
allow you to see why the oder in results.htm is not the same as that
in test.htm ...

Cheers

Geoff

[snip code/test files]

Before starting, note that this is just a suggestion/alternate solution.
I also thought the examples given by Tassilo in one of your other
threads were excellent and learned a lot, but being a newbie to
OO/subclassing/callbacks it seems like you may be focusing too much on
those parts of your code. I say that because I ran the code you supplied
and the output file *was* in the same order as your test.htm. Maybe the
problem is elsewhere?

I haven't followed this thread in its entirety, but if you're only
interested in a small set of events and doing relatively straightforward
parsing as in your example, you can also use regular subroutines to
parse the HTML. Something like this:

use strict;
use warnings;
use HTML::parser;

my $data;
{ local $/; $data = <DATA> }

my @to_print = qw[h2 p];
my @get_attr = qw[option];
my $current_tag = '';

my $parser = HTML::parser->new
(
report_tags => [ @to_print, @get_attr ],
default_h => [ \&default, 'text' ],
start_h => [ \&start_tag, 'tagname, attr, text' ],
)->parse( $data ) or die $!;

sub start_tag {
my ($tag, $attr, $text) = @_;
$current_tag = $tag;
( grep { $current_tag eq $_ } @to_print )
? print $text
: print $attr->{value};
}

sub default {
print shift if grep { $current_tag eq $_ } @to_print;
}
__DATA__
<html>
<head><title>test</title>
</head>
<body>

<h2>Finance </h2>

<p> description re Finance document</p>

<option
value="docs/gcse/student-activities/finance">Marketing</option>

<h2>Marketing</h2>

<p> description re marketing document </p>

<option
value="docs/gcse/student-activities/marketing">Marketing</option>

</body>
</html>
__END__

Worth noting:

1. You're only looking for three tags in your example, so its a good
idea to use the 'report_tags' key (method) in the new() constructor.
Basically it allows the parser to skip all tags not specified and makes
it more efficient. You get both start and end events, and text events in
between. The default handler above picks up the end/text events.

2. Just skimmed through your code, but if I understood correctly you
only want to (a) print 'p' and 'h2' tags and *text* inside of the tags,
or (b) get the 'value' attribute of 'option' tags. You should probably
add a test in the start_tag sub to make sure there is a 'value' attribute.

3. As in your existing code you need to reset $current_tag for each file
parsed.

4. Guilty of using grep in void context - this is a FAQ, but this is
just a short example.

HTH -keith
 
T

Tassilo v. Parseval

Also sprach Geoff Cox:
Tassilo,

OK - have tried to follow your suggestions in cutting down the code
etc.

Thank you. I can now run it without going through any hoops.
The code and the 3 files test.htm, db.txt and allphp2.php can be in
the same folder. The results will appear in results.htm. Hope this can
allow you to see why the oder in results.htm is not the same as that
in test.htm ...

After running your script, the results.htm I get is this:

<html><head><title>test</title>
</head><body>
<table width='100%' border='1'>
<h2>Finance </h2>
<p> description re Finance document</p>
<td valign='top'>
<a href="docs/gcse/student-activities/finance/finance-doc1.zip">Document1</a><br>
[...]
<a href="docs/gcse/student-activities/finance/finance-doc8.zip">Document8</a><br>
</td></tr>
<h2>Marketing</h2>
<p> description re marketing document </p>
<td valign='top'>
<a href="docs/gcse/student-activities/marketing/marketing-doc1.zip">Document1</a><br>
[...]
<a href="docs/gcse/student-activities/marketing/marketing-doc11.zip">Document11</a><br>
</td></tr>
</body></html>

Considering the input:
-------------- test.htm ----------------------

<html>
<head><title>test</title>
</head>
<body>

<h2>Finance </h2>

<p> description re Finance document</p>

<option
value="docs/gcse/student-activities/finance">Marketing</option>

<h2>Marketing</h2>

<p> description re marketing document </p>

<option
value="docs/gcse/student-activities/marketing">Marketing</option>

</body>
</html>

the order of the output looks correct to me. It's <h2>, <p> and finally
all the <a>'s. That's what I would expect when looking at your code.

If you have a different order, then it could have something to do with
buffering and output not happening in the order of the print()
statements. That would be a bit odd as each string you print is
terminated with a newline so buffering should not be an issue in this
case. But one never knows.

Can you try to enable autoflush on your output handle?

package main;
use IO::Handle;

open OUT, ">>results.htm" or die $!;
OUT->autoflush(1);

...

This will tell perl to do a flush after each print(). Does this help?

Tassilo
 
G

Geoff Cox

package main;
use IO::Handle;

open OUT, ">>results.htm" or die $!;
OUT->autoflush(1);

Tassilo,

Sorry for the delay in replying. Have tried above but still get the
wrong order!

Cheers

Geoff
 
G

Geoff Cox

I haven't followed this thread in its entirety, but if you're only
interested in a small set of events and doing relatively straightforward
parsing as in your example, you can also use regular subroutines to
parse the HTML. Something like this:

Keith,

Thanks for the thoughts - will give it a try!

Cheers

Geoff
 
T

Tassilo v. Parseval

Also sprach Geoff Cox:
Tassilo,

Sorry for the delay in replying. Have tried above but still get the
wrong order!

That is somewhat unfortunate as the problem in question does not occur
when I run your code. It's a bit tricky to address a bug that is not
reproducible.

Tassilo
 
R

Robin

package MyParser;
use base qw(HTML::parser);
use File::Find;

my $in_heading;
my $p;

my $dir = ("d:/a-keep9/short-nondb/oldshort2");

find sub {

my $name = $_;

open (OUT, ">>d:/a-keep9/short-nondb/short/members2/$name");

print OUT ("<html><head><title>test</title></head><body> \n");
print OUT ("<table width='100%' border='1'> \n");

sub start {

my ($self, $tagname, $attr, undef, $origtext) = @_;

if ($tagname eq 'h2') {
$in_heading = 1;
return;
}

if ($tagname eq 'p') {
$p = 1;
return;
}

if ($tagname eq 'option') {

choice($attr->{ value });

}

}

sub end {
my ($self, $tagname, $origtext) = @_;
if ($tagname eq 'h2') {
$in_heading = 0;
return;
}


if ($tagname eq 'p') {
$p = 0;
return;
}
}

sub text {
my ($self, $origtext) = @_;
print OUT ("<h2>$origtext</h2> \n") if $in_heading;
print OUT ("<p>$origtext</p> \n") if $p;

}

sub choice {
my ($path) = @_;

if ($path =~ /docs\/aslevel\/classroom-notes/) {
intro($path);
classroomnotes($path);
}

}

sub intro {

my ($pathhere) = @_;
open (INN, "d:/a-keep9/short-nondb/db/total-160404.txt");
my $lineintro;

while (defined ($lineintro = <INN>)) {
if ($lineintro =~ /$pathhere','(.*?)'\)\;/) {
print OUT ("<tr><td>$1 <p> </td>\n");
}
}
}



sub classroomnotes {

my ($pattern) = @_;

open (INNN, "d:/a-keep9/short-nondb/allphp/allphp2.php");

my $line = <INNN>;

while (<INNN>){
last if /$pattern/;
}
my ($curr, $next1, $next2, $next3) = <INNN>;
close (INNN);

if ($next3 =~ /\$i\<(\d+);/) {
my $nn = $1;
print OUT ("<td valign='top'> \n");
for ($c=1;$c<$nn;$c++) {
print OUT ('<a href="'. $pattern . "-doc" . $c . ".zip" . '">' .
"Document$c" . "</a><br>" . "\n");
}
print OUT ("</td></tr>\n");
}
}


print OUT ("</body></html> \n");

package main;
open (IN, $name);
undef $/;
my $html = <IN>;
my $parser = MyParser->new;
$parser->parse($html);

}, $dir;

hmm... it's hard to read your code... you might want to work on structuring
it a bit. But I'm not the one to talk about that, my code is so funky to
read.
-Robin
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,777
Messages
2,569,604
Members
45,209
Latest member
NelsonJax

Latest Threads

Top