Dear all,
I'm trying to help to extrace email from company's website.
Here's part of my test script.
$content = "<a href=\"mailto:
[email protected]\"><a class=\"hello\" href=
\"mailto:
[email protected]?title=hello\">";
@emails = ($content =~ /<a.*href="mailto
.*)>"/cgim);
foreach my $email (@emails)
{
print "email:" . $email . "\n";
}
But to my surprise, no result is printed.
I'm working on Debian squeeze.
My perl version is 5.10.0.
Would anyone here please help me out?
Thanks for any hints or advice in advance.
Below are better html/xml regular expressions to parse <tag attrib/>,
what your interrested in. And it will get all the 'mailto:'s.
Might as well do it right.
Test sample html and output below __DATA__ section
-sln
## Arxp.pl
##
## Simple html/xml regexp parser for just <tag attrib/>
## No entity conversions, no extras.
## Let me know if you want conversions or simple extra's
## -sln 4/19/09
##
my @UC_Nstart = (
"\\x{C0}-\\x{D6}",
"\\x{D8}-\\x{F6}",
"\\x{F8}-\\x{2FF}",
"\\x{370}-\\x{37D}",
"\\x{37F}-\\x{1FFF}",
"\\x{200C}-\\x{200D}",
"\\x{2070}-\\x{218F}",
"\\x{2C00}-\\x{2FEF}",
"\\x{3001}-\\x{D7FF}",
"\\x{F900}-\\x{FDCF}",
"\\x{FDF0}-\\x{FFFD}",
"\\x{10000}-\\x{EFFFF}",
);
my @UC_Nchar = (
"\\x{B7}",
"\\x{0300}-\\x{036F}",
"\\x{203F}-\\x{2040}",
);
my $Nstrt = "[A-Za-z_:".join ('',@UC_Nstart)."]";
my $Nchar = "[-\\w:\\.".join ('',@UC_Nchar).join ('',@UC_Nstart)."]";
my $Name = "(?:$Nstrt$Nchar*)";
my $qRx = qr/<(?
?
$Name)(\s+(?
?
?:".*?")|(?:'.*?'))|(?:[^>]*?))+)\s*\/?)|--.*?--)>/s;
# <( ( 1 12 ( ( ( )|( ))|( )) 2 )| )>
my $qRxAttr = qr/\G\s+(?
?
$Name)\s*=\s*("|'|))|($Name))/;
my $qRxAttr_DL1 = qr/\G(?
[^'&<]*?)|([^'<]*?))'/;
my $qRxAttr_DL2 = qr/\G(?
[^"&<]*?)|([^"<]*?))"/;
my $qRxAttr_DL3 = qr/\G([^"'=<\s]+)/;
my $html = join '', <DATA>;
while ($html =~ /$qRx/g)
{
## <tag attrib/> or <tag attrib>
##
if (defined $1) # && lc($1) eq 'a'
{
my %result = ();
# get attributes
$result = _getAttrARRAY ($2, 0, \%result);
## do checks
if (length ($result->{'errstr'})) {
# missing or extra token, hard error
print "Error in tag attrib string here ->'$result->{'errstr'}'\n";
next;
}
## we will consider these acceptable html, not processed for this
# length ($result->{'dupattrs'})
# length ($result->{'badattrs'})
# length ($result->{'noquoteattrs'})
## process (scrape) attribute array for 'mailto:'
my %htmp = @{$result->{'attrsref'}};
while (my ($atr,$val) = each %htmp)
{
push @emails, $1 if ($val =~ /mailto
.+)/is);
}
}
}
print $_,"\n" for @emails;
# -------------------------------
sub _convertEntities { undef } # intentionally blank
sub _getAttrARRAY
{
my ($attrstr, $conv_ent, $hresult) = @_;
@{$hresult->{'attrsref'}} = ();
$hresult->{'badattrs'} = '';
$hresult->{'dupattrs'} = '';
$hresult->{'noquoteattrs'} = '';
$hresult->{'errstr'} = '';
my %hseen = ();
my $aref = $hresult->{'attrsref'};
my ($alt_attval, $attval, $rx, $ndx, $DL3);
# my $tmpstr = $attrstr;
my $match = 0;
while ($attrstr =~ /$qRxAttr/gc)
{
$match = 1;
if (defined $2)
{
$ndx = push @{$aref},$1;
$DL3 = 0;
if ($2 eq "'") {
$rx = \$qRxAttr_DL1;
}
elsif ($2 eq '"') {
$rx = \$qRxAttr_DL2;
} else {
# no quotes
$rx = \$qRxAttr_DL3;
$DL3 = 1;
}
if (++$hseen{$1} == 2) {
$hresult->{'dupattrs'} .= ", $1";
$hresult->{'dupattrs'} =~ s/^(?:, )+//;
}
if ($attrstr =~ /$$rx/gc) {
if (!$DL3)
{
## normal quoted value
if (defined $1) {
push @{$aref},$1;
next;
}
$attval = $2;
if ($conv_ent && defined ($alt_attval = _convertEntities (\$attval))) {
push @{$aref},$$alt_attval;
next;
}
push @{$aref},$attval;
next;
}
## bad attrib, value is not quoted
$attval = $1;
if ($conv_ent && defined ($alt_attval = _convertEntities (\$attval))) {
push @{$aref},$$alt_attval;
} else {
push @{$aref},$attval;
}
$hresult->{'noquoteattrs'} .= ", ".$aref->[$ndx-1];
$hresult->{'noquoteattrs'} =~ s/^(?:, )+//;
next;
}
## bad value, its either '<' or no ["'] closure
$hresult->{'badattrs'} .= ", ".$aref->[$ndx-1];
$hresult->{'badattrs'} =~ s/^(?:, )+//;
push @{$aref},'UNDEF_ATTRVAL';
# trim up to '<', otherwise its reported as
# improperly quoted or missing value
$attrstr = substr ($attrstr, pos($attrstr));
$attrstr =~ s/^[^<]+//;
} else {
## attrib with no attrib value
## (standalone atrribute only)
$ndx = push @{$aref},$3;
if (++$hseen{$3} == 2) {
$hresult->{'dupattrs'} .= ", $3";
$hresult->{'dupattrs'} =~ s/^(?:, )+//;
}
$hresult->{'badattrs'} .= ", ".$aref->[$ndx-1];
$hresult->{'badattrs'} =~ s/^(?:, )+//;
push @{$aref},'UNDEF_ATTRVAL';
next;
}
# bad, return that part of string which is in error
$hresult->{'errstr'} = $attrstr;
return $hresult;
}
pos($attrstr) = 0 if (!$match);
if (length($attrstr) > pos($attrstr)) {
$attrstr = substr ($attrstr, pos($attrstr));
$attrstr =~ s/^\s+//; $attrstr =~ s/\s+$//;
# bad, return that part of string which is in error
# print "-BAD-:$tmpstr\n";
$hresult->{'errstr'} = $attrstr if (length($attrstr));
}
return $hresult;
}
__DATA__
<-- Don't include <a href='mailto:
[email protected]'> me -->
<a href='mailto:
[email protected]'>
<a class="hello" href=
"mailto:
[email protected]?title='<hello>';">
<tag scrape_me = "mailto:
[email protected]"
/>
<oh nogood = "mailto:
[email protected]' >
Output:
(e-mail address removed)
(e-mail address removed)?title='<hello>';
(e-mail address removed)