: Sounds simple enough. I need to retrieve the source from a web page
: and then find a link in that web page that ends with a string which I
: have stored in a variable. Can someone please post or direct me to a
: sample of how to do this? Thanks!
Try this on for size:
% cat try
#! /usr/local/bin/perl
use strict;
use warnings;
use HTML::Parser;
use LWP::UserAgent;
use URI::URL;
use Data::Dumper;
# Build an HTML::Parser that collects the anchors of a document.
#
# Returns a two-element list:
#   * the parser object, to be fed with ->parse;
#   * a closure returning the links recorded so far, each one an array
#     ref [ $text, $href ].  $text is the literal string '<empty>' when
#     the anchor enclosed no text at all.
#
# Only <a> elements carrying a true href attribute are collected.
sub make_parser {
    my $inside;    # true between an accepted <a href=...> and its </a>
    my %attr;      # attributes of the anchor currently being collected
    my $text;      # text accumulated inside that anchor
    my @links;     # finished [ text, href ] pairs

    # Close the current anchor: sanity-check the collector state, store the
    # link when every check passes, then reset for the next anchor.
    my $record = sub {
        my @cond = (
            [ sub { $inside },     "not inside" ],
            [ sub { %attr },       "no attr" ],
            [ sub { $attr{href} }, "no href" ],
        );

        my $state;    # diagnostic dump, built only if a check fails
        my $ok = 1;
        for my $cond (@cond) {
            my ($check, $msg) = @$cond;
            next if $check->();

            unless (defined $state) {
                $state = Dumper({
                    inside => $inside,
                    attr   => \%attr,
                    text   => $text,
                });
            }
            warn "$0: $msg:\n$state ";
            $ok = 0;
        }

        if ($ok) {
            # Test for length, not truth, so that link text "0" is kept.
            my $label = (defined $text && length $text) ? $text : '<empty>';
            push @links, [ $label, $attr{href} ];
        }

        $inside = 0;
        %attr   = ();
        $text   = '';
    };

    # Start tag handler: called with the tag name and a hash ref of attributes.
    my $start_h = sub {
        my $tag = shift;
        return unless $tag eq 'a';

        # A new <a> before the previous one was closed: flush the old one.
        if ($inside) {
            warn "$0: already inside";
            $record->();
        }

        my $attr = shift;
        return unless $attr->{href};
        %attr   = %$attr;
        $inside = 1;
    };

    # Text handler: accumulate text only while inside an accepted anchor.
    my $text_h = sub {
        return unless $inside;
        $text .= shift;
    };

    # End tag handler: </a> closes the anchor being collected, if any.
    my $end_h = sub {
        my $tag = shift;
        return unless $tag eq 'a';
        return unless $inside;
        $record->();
    };

    my $p = HTML::Parser->new(
        api_version => 3,
        start_h     => [ $start_h, "tagname, attr" ],
        text_h      => [ $text_h,  "dtext" ],
        end_h       => [ $end_h,   "tagname" ],
    );

    return ($p, sub { @links });
}
# Usage banner for the command line; $0 names the running script.
# Takes no arguments (empty prototype) and returns the message string.
sub usage () {
    return "Usage: $0 search-pattern\n";
}
## main
# Fetch the page, feed it to the parser as it arrives, then print every
# collected link whose text matches the pattern given on the command line.
die usage() unless @ARGV;

my $pat = shift;

# qr// dies on a malformed pattern; the eval turns that into an undef result.
my $lookfor = eval { qr/$pat/ };
die "$0: bad pattern: $pat" unless $lookfor;

my $url = "http://www.cpan.org/";
my $ua  = LWP::UserAgent->new;
my ($p, $links) = make_parser();

# Request document and parse it as it arrives
my $res = $ua->request(
    HTTP::Request->new(GET => $url),
    sub { $p->parse($_[0]) }
);

# Report a failed fetch instead of silently printing nothing.
die "$0: $url: " . $res->status_line . "\n" unless $res->is_success;

# Tell the parser the document is complete.
$p->eof;

# Relative hrefs are resolved against the response's base URL.
my $base = $res->base;
for my $link ($links->()) {
    my ($text, $href) = @$link;
    next unless $text =~ /$lookfor$/;

    my $abs = url($href, $base)->abs;
    $text =~ s/\s+/ /g;    # collapse runs of whitespace for display
    print "$text:\n $abs\n";
}
% ./try 's$'
Perl modules:
http://www.cpan.org/modules/index.html
Perl scripts:
http://www.cpan.org/scripts/index.html
Perl recent arrivals:
http://www.cpan.org/RECENT.html
CPAN sites:
http://www.cpan.org/SITES.html
CPAN sites:
http://mirrors.cpan.org/
CPAN modules, distributions, and authors:
http://search.cpan.org/
CPAN Frequently Asked Questions:
http://www.cpan.org/misc/cpan-faq.html
Perl Mailing Lists:
http://lists.cpan.org/
Perl Bookmarks:
http://bookmarks.cpan.org/
% ./try '('
./try: bad pattern: ( at ./try line 95.
Hope this helps,
Greg