B
BirgitteRand
I don't know how to follow links in an array (@links) at the bottom of
this script. Can anyone help me?
/Birgitte
#!/usr/bin/perl
use strict;
use WWW::Mechanize;
use LWP::Simple;
use HTML::TokeParser;
use XML::RSS;
# Create the RSS object.
my $rss = XML::RSS->new( version => '2.0' );
# Prep the RSS.
$rss->channel(
title => "JP",
link => "http://jp.dk/seneste",
description => "JP");
my $starting_url = 'http://jp.dk/seneste/';
my $output_dir = "c:/temp/jp";
# Create a new instance of WWW::Mechanize
my $mechanize = WWW::Mechanize->new();
# Retrieve the page
$mechanize->get($starting_url);
my $html = $mechanize->content;
my $p = HTML::TokeParser->new( \$html );
#jump through tags until you get 'h1'
while( my $title = $p->get_tag( 'h1' )) {
last if $title->[1]->{class} eq 'h1';
}
# look through the tokens until you hit the end of 'h1'
my @links;
while ( my $token = $p->get_token ) {
last if $token->[0] eq 'E' && $token->[1] eq 'h1'; #i.e., a
div end tag
if ( $token->[0] eq 'S' && $token->[1] eq 'a' ) {
push @links, $token->[2]->{href} if $token->[2]->{href} =~ /\/udland
\/.*?article.*/;
}
}
# now follow the links
for my $link ( @links ) {
$mechanize->follow( $link );
my $html = $mechanize->content;
my $p = HTML::TokeParser->new( \$html );
while( my $article = $p->get_token( 'h1' )) {
if ( $article->[0] eq 'S' and $article->[1] eq 'h1' ) {
my $title = $p->get_trimmed_text( '/h1' );
$article = $p->get_tag('p');
$article = $p->get_tag('p');
my $date = $p->get_trimmed_text('/p');
print "$date\n$title\n\n";
}
}
}
this script. Can anyone help me?
/Birgitte
#!/usr/bin/perl
use strict;
use WWW::Mechanize;
use LWP::Simple;
use HTML::TokeParser;
use XML::RSS;
# Create the RSS object.
my $rss = XML::RSS->new( version => '2.0' );
# Prep the RSS.
$rss->channel(
title => "JP",
link => "http://jp.dk/seneste",
description => "JP");
my $starting_url = 'http://jp.dk/seneste/';
my $output_dir = "c:/temp/jp";
# Create a new instance of WWW::Mechanize
my $mechanize = WWW::Mechanize->new();
# Retrieve the page
$mechanize->get($starting_url);
my $html = $mechanize->content;
my $p = HTML::TokeParser->new( \$html );
#jump through tags until you get 'h1'
while( my $title = $p->get_tag( 'h1' )) {
last if $title->[1]->{class} eq 'h1';
}
# look through the tokens until you hit the end of 'h1'
my @links;
while ( my $token = $p->get_token ) {
last if $token->[0] eq 'E' && $token->[1] eq 'h1'; #i.e., a
div end tag
if ( $token->[0] eq 'S' && $token->[1] eq 'a' ) {
push @links, $token->[2]->{href} if $token->[2]->{href} =~ /\/udland
\/.*?article.*/;
}
}
# now follow the links
for my $link ( @links ) {
$mechanize->follow( $link );
my $html = $mechanize->content;
my $p = HTML::TokeParser->new( \$html );
while( my $article = $p->get_token( 'h1' )) {
if ( $article->[0] eq 'S' and $article->[1] eq 'h1' ) {
my $title = $p->get_trimmed_text( '/h1' );
$article = $p->get_tag('p');
$article = $p->get_tag('p');
my $date = $p->get_trimmed_text('/p');
print "$date\n$title\n\n";
}
}
}