D
David.Bramer
Hi
Thought I would share my code with you as it might help to fix my
problem a bit quicker...
I thought I would be able to somehow use HTML::TableExtract to pull out
the links for each of the fund names on the page at the URL held in the
variable $MORNINGSTAR.
For instance "CAAM Funds Latin America Equities C Inc" has a link
http://www.morningstar.co.uk/UK/snapshot/snapshot.aspx?id=F0GBR05SXI
This is what I would like to extract.
Does anyone know how I can do this with HTML::TableExtract or WWW::Mechanize?
Your help would be really appreciated!
Thanks
David
#!/usr/local/bin/perl
use HTML::TableExtract;
use LWP::Simple;
use WWW::Mechanize;
## Morningstar launch page.  The URL is kept on a single line: a line break
## inside the quotes would become part of the URL that is requested.
our $MORNINGSTAR = 'http://www.morningstar.co.uk/UK/ISAQuickrank/default.aspx?tab=1&sortby=ReturnM60&lang=en-GB';
## File with the NASDAQ close prices (read by nasdaqToArray).
our $NASDAQ_PRICE_DATA = 'nasdaq_daily.txt';

## First argument of the pager's __doPostBack() call.  Single-quoted so the
## '$' characters stay literal, and on one line so no newline ends up in it.
my $PAGER_TARGET = 'ctl00$ctl00$MainContent$Layout_1MainContent$AspNetPager1';

## Variables
my $mech = WWW::Mechanize->new();    ## perl agent

## Obtain latest NASDAQ values.
## The subs are called with a leading '&' on purpose: their declarations in
## this file carry an empty prototype, and '&' bypasses prototype checking.
my @nasdaqPrices = &nasdaqToArray($NASDAQ_PRICE_DATA);    ## nasdaq close prices
my $nasdaqWeekly = &nasdaqPerformanceWeekly(@nasdaqPrices);    ## weekly change
print "Nasdaq w ".$nasdaqWeekly."\n";
my $nasdaqDaily = &nasdaqPerformanceDaily(@nasdaqPrices);      ## daily change
print "Nasdaq d ".$nasdaqDaily."\n";

$mech->agent_alias("Windows IE 6");
$mech->get($MORNINGSTAR);

## Find the total number of pages: it is taken to be the fifth integer that
## appears in the URL of the link whose text is "last".
my $last_link = $mech->find_link( text => "last" )
    or die "No link with text 'last' found on $MORNINGSTAR\n";

## Four integers are skipped, the fifth one is captured.
my $re = qr{ (?: .*? [-+]?\d+ ){4} .*? ( [-+]?\d+ ) }xis;

my $total_pages = 0;
if ( $last_link->url() =~ $re ) {
    $total_pages = 0 + $1;    ## force a number so the range below is numeric
}
else {
    ## Previously this case silently printed nothing at all.
    warn "Could not work out the page count from '" . $last_link->url() . "'\n";
}

## Print every single page out.
for my $page ( 1 .. $total_pages ) {
    ## Page 1 is the page get() already fetched; every later page has to be
    ## requested through the pager postback first.
    &doPostBack( $mech, $PAGER_TARGET, $page ) if $page > 1;
    &printTable( $mech, $nasdaqDaily, $nasdaqWeekly );
}
## printTable($mech, $dailychange, $weeklychange)
##
## Parses the page currently held by the WWW::Mechanize object $mech and, for
## every table row whose "1 Day" value is positive and greater than
## $dailychange, prints the fund name, the 1-day value and $dailychange,
## followed by the absolute URL of the link whose text is the fund name
## (when such a link is found on the page).
##
##   $mech         - object providing content() and find_link()
##   $dailychange  - threshold the row's 1-day value has to exceed
##   $weeklychange - accepted for the callers' sake; not used at present
##
## Returns nothing.
sub printTable
{
    my ( $mech, $dailychange, $weeklychange ) = @_;

    ## Column headers to look for.  The backslash escapes are kept verbatim by
    ## qw(); presumably HTML::TableExtract treats the strings as patterns, so
    ## \s / \n match the whitespace inside the header cells - TODO confirm.
    my $te = HTML::TableExtract->new(
        headers => [
            qw(Fund\sName Latest\nPrice 1\sDay 1\sWeek 1\sMonth 3\sMonth
               6\sMonth Date)
        ],
    );
    $te->parse( $mech->content() );

    foreach my $ts ( $te->tables ) {
        foreach my $row ( $ts->rows ) {
            ## Only the first and third columns are used below.
            my ( $fund_name, undef, $p1_day ) = @$row;

            ## A missing cell can never beat the threshold.
            next unless defined $p1_day;
            next unless ( $p1_day > $dailychange ) && ( $p1_day > 0 );

            my $name = defined $fund_name ? $fund_name : '';
            print $name."\t\t\t\t\t\t\t".$p1_day."\t".$dailychange."\n";

            ## Find the link.  The cell text may carry surrounding or repeated
            ## whitespace that the link text presumably does not, so look it
            ## up by a tidied copy of the name.
            ( my $lookup = $name ) =~ s/\s+/ /g;
            $lookup =~ s/\A | \z//g;
            next if $lookup eq '';

            my $link = $mech->find_link( text => $lookup );

            ## $link is an object; url_abs() yields the absolute URL.
            print $link->url_abs()."\n\n" if $link;
        }
    }
    return;
}
## doPostBack($agent, $target, $arg)
##
## Fills in the hidden __EVENTTARGET / __EVENTARGUMENT fields of the form
## named "aspnetForm" and submits it, i.e. does by hand what the page's
## __doPostBack() call would do.
##
##   $agent  - WWW::Mechanize agent-object
##   $target - first argument in the __doPostBack() call
##   $arg    - second argument (the callers pass the page number)
##
## Returns whatever $agent->submit() returns.  Dies when the form cannot be
## selected, instead of carrying on with whichever form is current.
sub doPostBack
{
    my ( $agent, $target, $arg ) = @_;

    defined $agent->form_name("aspnetForm")
        or die "doPostBack: no form named 'aspnetForm' on the current page\n";

    $agent->field( '__EVENTTARGET',   $target );
    $agent->field( '__EVENTARGUMENT', $arg );
    return $agent->submit();
}
## nasdaqToArray($nasdaq_price_data)
##
## Reads the file named $nasdaq_price_data line by line, splits every line on
## ", " (comma followed by one space) and returns all fields, in file order,
## as one flat list.
##
## Dies with the file name and the system error when the file cannot be
## opened.
sub nasdaqToArray
{
    my ($nasdaq_price_data) = @_;
    my @nasdaqPriceList;

    ## Three-argument open with a lexical handle: the file name can never be
    ## taken for an open mode, and the handle cannot clash with other code.
    open( my $fh, '<', $nasdaq_price_data )
        or die "Cannot open '$nasdaq_price_data': $!\n";

    while ( my $line = <$fh> ) {
        chomp $line;                                    # removes the trailing newline
        push @nasdaqPriceList, split( /, /, $line );    # breaks the line up into fields
    }
    close $fh;

    return @nasdaqPriceList;
}
## nasdaqPerformanceDaily(@prices)
##
## Returns the percentage change from the second-to-last to the last element
## of @prices, formatted with four decimals (e.g. "10.0000").
##
## Dies when fewer than two prices are given or when the earlier price is
## zero, since the change cannot be computed then.
sub nasdaqPerformanceDaily
{
    my @nasdaqPriceList = @_;

    die "nasdaqPerformanceDaily: need at least 2 prices, got "
        . scalar(@nasdaqPriceList) . "\n"
        if @nasdaqPriceList < 2;

    ## Single elements, so the scalar form $list[i] rather than a slice.
    my $old = $nasdaqPriceList[-2];
    my $new = $nasdaqPriceList[-1];

    die "nasdaqPerformanceDaily: previous price is zero or not a number\n"
        if $old == 0;

    return sprintf( "%.4f", ( ( $new - $old ) / $old ) * 100 );
}
## nasdaqPerformanceWeekly(@prices)
##
## Returns the percentage change from the fifth-from-last to the last element
## of @prices, formatted with four decimals (e.g. "10.0000").
##
## Dies when fewer than five prices are given or when the earlier price is
## zero, since the change cannot be computed then.
##
## NOTE(review): [-5] to [-1] spans four steps.  If the list holds one close
## per trading day and a full five-day week is intended, the baseline would
## have to be [-6] - confirm before changing.
sub nasdaqPerformanceWeekly
{
    my @nasdaqPriceList = @_;

    die "nasdaqPerformanceWeekly: need at least 5 prices, got "
        . scalar(@nasdaqPriceList) . "\n"
        if @nasdaqPriceList < 5;

    ## Single elements, so the scalar form $list[i] rather than a slice.
    my $old = $nasdaqPriceList[-5];
    my $new = $nasdaqPriceList[-1];

    die "nasdaqPerformanceWeekly: baseline price is zero or not a number\n"
        if $old == 0;

    return sprintf( "%.4f", ( ( $new - $old ) / $old ) * 100 );
}
Thought I would share my code with you as it might help to fix my
problem a bit quicker...
I thought I would be able to somehow use HTML::TableExtract to pull out
the links for each of the fund names on the page at the URL held in the
variable $MORNINGSTAR.
For instance "CAAM Funds Latin America Equities C Inc" has a link
http://www.morningstar.co.uk/UK/snapshot/snapshot.aspx?id=F0GBR05SXI
This is what I would like to extract.
Does anyone know how I can do this with HTML::TableExtract or WWW::Mechanize?
Your help would be really appreciated!
Thanks
David
#!/usr/local/bin/perl
use HTML::TableExtract;
use LWP::Simple;
use WWW::Mechanize;
## Morningstar launch page.  The URL is kept on a single line: a line break
## inside the quotes would become part of the URL that is requested.
our $MORNINGSTAR = 'http://www.morningstar.co.uk/UK/ISAQuickrank/default.aspx?tab=1&sortby=ReturnM60&lang=en-GB';
## File with the NASDAQ close prices (read by nasdaqToArray).
our $NASDAQ_PRICE_DATA = 'nasdaq_daily.txt';

## First argument of the pager's __doPostBack() call.  Single-quoted so the
## '$' characters stay literal, and on one line so no newline ends up in it.
my $PAGER_TARGET = 'ctl00$ctl00$MainContent$Layout_1MainContent$AspNetPager1';

## Variables
my $mech = WWW::Mechanize->new();    ## perl agent

## Obtain latest NASDAQ values.
## The subs are called with a leading '&' on purpose: their declarations in
## this file carry an empty prototype, and '&' bypasses prototype checking.
my @nasdaqPrices = &nasdaqToArray($NASDAQ_PRICE_DATA);    ## nasdaq close prices
my $nasdaqWeekly = &nasdaqPerformanceWeekly(@nasdaqPrices);    ## weekly change
print "Nasdaq w ".$nasdaqWeekly."\n";
my $nasdaqDaily = &nasdaqPerformanceDaily(@nasdaqPrices);      ## daily change
print "Nasdaq d ".$nasdaqDaily."\n";

$mech->agent_alias("Windows IE 6");
$mech->get($MORNINGSTAR);

## Find the total number of pages: it is taken to be the fifth integer that
## appears in the URL of the link whose text is "last".
my $last_link = $mech->find_link( text => "last" )
    or die "No link with text 'last' found on $MORNINGSTAR\n";

## Four integers are skipped, the fifth one is captured.
my $re = qr{ (?: .*? [-+]?\d+ ){4} .*? ( [-+]?\d+ ) }xis;

my $total_pages = 0;
if ( $last_link->url() =~ $re ) {
    $total_pages = 0 + $1;    ## force a number so the range below is numeric
}
else {
    ## Previously this case silently printed nothing at all.
    warn "Could not work out the page count from '" . $last_link->url() . "'\n";
}

## Print every single page out.
for my $page ( 1 .. $total_pages ) {
    ## Page 1 is the page get() already fetched; every later page has to be
    ## requested through the pager postback first.
    &doPostBack( $mech, $PAGER_TARGET, $page ) if $page > 1;
    &printTable( $mech, $nasdaqDaily, $nasdaqWeekly );
}
## printTable($mech, $dailychange, $weeklychange)
##
## Parses the page currently held by the WWW::Mechanize object $mech and, for
## every table row whose "1 Day" value is positive and greater than
## $dailychange, prints the fund name, the 1-day value and $dailychange,
## followed by the absolute URL of the link whose text is the fund name
## (when such a link is found on the page).
##
##   $mech         - object providing content() and find_link()
##   $dailychange  - threshold the row's 1-day value has to exceed
##   $weeklychange - accepted for the callers' sake; not used at present
##
## Returns nothing.
sub printTable
{
    my ( $mech, $dailychange, $weeklychange ) = @_;

    ## Column headers to look for.  The backslash escapes are kept verbatim by
    ## qw(); presumably HTML::TableExtract treats the strings as patterns, so
    ## \s / \n match the whitespace inside the header cells - TODO confirm.
    my $te = HTML::TableExtract->new(
        headers => [
            qw(Fund\sName Latest\nPrice 1\sDay 1\sWeek 1\sMonth 3\sMonth
               6\sMonth Date)
        ],
    );
    $te->parse( $mech->content() );

    foreach my $ts ( $te->tables ) {
        foreach my $row ( $ts->rows ) {
            ## Only the first and third columns are used below.
            my ( $fund_name, undef, $p1_day ) = @$row;

            ## A missing cell can never beat the threshold.
            next unless defined $p1_day;
            next unless ( $p1_day > $dailychange ) && ( $p1_day > 0 );

            my $name = defined $fund_name ? $fund_name : '';
            print $name."\t\t\t\t\t\t\t".$p1_day."\t".$dailychange."\n";

            ## Find the link.  The cell text may carry surrounding or repeated
            ## whitespace that the link text presumably does not, so look it
            ## up by a tidied copy of the name.
            ( my $lookup = $name ) =~ s/\s+/ /g;
            $lookup =~ s/\A | \z//g;
            next if $lookup eq '';

            my $link = $mech->find_link( text => $lookup );

            ## $link is an object; url_abs() yields the absolute URL.
            print $link->url_abs()."\n\n" if $link;
        }
    }
    return;
}
## doPostBack($agent, $target, $arg)
##
## Fills in the hidden __EVENTTARGET / __EVENTARGUMENT fields of the form
## named "aspnetForm" and submits it, i.e. does by hand what the page's
## __doPostBack() call would do.
##
##   $agent  - WWW::Mechanize agent-object
##   $target - first argument in the __doPostBack() call
##   $arg    - second argument (the callers pass the page number)
##
## Returns whatever $agent->submit() returns.  Dies when the form cannot be
## selected, instead of carrying on with whichever form is current.
sub doPostBack
{
    my ( $agent, $target, $arg ) = @_;

    defined $agent->form_name("aspnetForm")
        or die "doPostBack: no form named 'aspnetForm' on the current page\n";

    $agent->field( '__EVENTTARGET',   $target );
    $agent->field( '__EVENTARGUMENT', $arg );
    return $agent->submit();
}
## nasdaqToArray($nasdaq_price_data)
##
## Reads the file named $nasdaq_price_data line by line, splits every line on
## ", " (comma followed by one space) and returns all fields, in file order,
## as one flat list.
##
## Dies with the file name and the system error when the file cannot be
## opened.
sub nasdaqToArray
{
    my ($nasdaq_price_data) = @_;
    my @nasdaqPriceList;

    ## Three-argument open with a lexical handle: the file name can never be
    ## taken for an open mode, and the handle cannot clash with other code.
    open( my $fh, '<', $nasdaq_price_data )
        or die "Cannot open '$nasdaq_price_data': $!\n";

    while ( my $line = <$fh> ) {
        chomp $line;                                    # removes the trailing newline
        push @nasdaqPriceList, split( /, /, $line );    # breaks the line up into fields
    }
    close $fh;

    return @nasdaqPriceList;
}
## nasdaqPerformanceDaily(@prices)
##
## Returns the percentage change from the second-to-last to the last element
## of @prices, formatted with four decimals (e.g. "10.0000").
##
## Dies when fewer than two prices are given or when the earlier price is
## zero, since the change cannot be computed then.
sub nasdaqPerformanceDaily
{
    my @nasdaqPriceList = @_;

    die "nasdaqPerformanceDaily: need at least 2 prices, got "
        . scalar(@nasdaqPriceList) . "\n"
        if @nasdaqPriceList < 2;

    ## Single elements, so the scalar form $list[i] rather than a slice.
    my $old = $nasdaqPriceList[-2];
    my $new = $nasdaqPriceList[-1];

    die "nasdaqPerformanceDaily: previous price is zero or not a number\n"
        if $old == 0;

    return sprintf( "%.4f", ( ( $new - $old ) / $old ) * 100 );
}
## nasdaqPerformanceWeekly(@prices)
##
## Returns the percentage change from the fifth-from-last to the last element
## of @prices, formatted with four decimals (e.g. "10.0000").
##
## Dies when fewer than five prices are given or when the earlier price is
## zero, since the change cannot be computed then.
##
## NOTE(review): [-5] to [-1] spans four steps.  If the list holds one close
## per trading day and a full five-day week is intended, the baseline would
## have to be [-6] - confirm before changing.
sub nasdaqPerformanceWeekly
{
    my @nasdaqPriceList = @_;

    die "nasdaqPerformanceWeekly: need at least 5 prices, got "
        . scalar(@nasdaqPriceList) . "\n"
        if @nasdaqPriceList < 5;

    ## Single elements, so the scalar form $list[i] rather than a slice.
    my $old = $nasdaqPriceList[-5];
    my $new = $nasdaqPriceList[-1];

    die "nasdaqPerformanceWeekly: baseline price is zero or not a number\n"
        if $old == 0;

    return sprintf( "%.4f", ( ( $new - $old ) / $old ) * 100 );
}