Sorry, I am a little overwhelmed by the coding so far (I'm not very
good at Perl). I have what you posted, but my problem is that I
would like to filter that content. For example, let's say I searched a
site that had 15 news links and 3 of them said "Hello" in the title. I
would want to extract only the links that said "Hello" in the title.
This might help you. It requires Perl 5.10 or better, because the code uses named captures (%+).
-sln
Output:
Specific Tag/Attr Titles found --
Hello:
"
http://helloA.com"
"helloB.com"
no_title:
"/info/twitter.aspx"
All Tag/Attr found --
a-href:
"
http://helloA.com"
"/info/twitter.aspx"
"helloB.com"
link-href:
"/includes/css/main.css"
Code:
# -------------------------------------------
# rx_html_href.pl
# -sln, 3/20/2010
#
# Util to extract some attribute/val's from
# html/xml
# -------------------------------------------
use strict;
use warnings;
# -------------------------------------------
# Main script: configure the patterns, slurp the markup, parse it and
# report what was found.
# -------------------------------------------

# File-scoped state shared with the subs below:
#   $Name     - element/attribute name pattern, filled in by InitName()
#   $Rxmarkup - compiled markup/content regex, filled in by InitRegex()
my ($Name, $Rxmarkup);
InitName();

# Pattern fragments. They are only ever used inside /x patterns, so the
# spaces inside the parentheses are insignificant.
my $rxopen  = "(?: $Name )";      # Open tag with 'href' attrib, cannot be an empty alternation
#my $rxopen = "(?: a )";          # Alternative: only look at <a ...> open tags
my $rxattr  = "(?: href )";       # Attribute we seek, cannot have an empty alternation
my $rxclose = "(?: a )";          # Close tag to match with content, cannot have an empty alternation
my $rxtitle = "(?: Hello | )";    # Content title, can have an empty alternation

my %hTitles;  # title => attribute values matching tag open, title, and tag close
my %hHrefs;   # "tag-attr" => attribute values matching the tag open expression, not necessarily titles
InitRegex();

## Input: either read a file (disabled) or use the __DATA__ section.
# open my $fh, '<', 'C:/temp/XML/tennis1.html' or
# die "can't open file for input: $!";
# my $html = join '', <$fh>;
# close $fh;
my $html = join '', <DATA>;

## Parse; results land in %hTitles and %hHrefs.
ParseHref(\$html);

## Report. Keys are sorted so the output order is the same on every run
## (plain 'keys' returns hash keys in an unpredictable order).
print "\nSpecific Tag/Attr Titles found --\n";
for my $title (sort keys %hTitles) {
    print " $title:\n";
    print " $_\n" for @{ $hTitles{$title} };
}

print "\nAll Tag/Attr found -- \n";
for my $tag_attr (sort keys %hHrefs) {
    print " $tag_attr:\n";
    print " $_\n" for @{ $hHrefs{$tag_attr} };
}

exit (0);
##
# ParseHref(\$text)
#
# Scans the string behind the scalar reference with $Rxmarkup (//g) and
# fills the two file-scoped hashes:
#   %hHrefs  - "OPEN-ATTR" => every VAL captured by an OPEN match
#   %hTitles - title => VALs whose following content matched $rxtitle and
#              that were followed by a CLOSE match
# No meaningful value is returned; callers read the hashes afterwards.
sub ParseHref
{
my ($markup) = @_;
# Parser state:
#   $url      - VAL of the most recent OPEN match ('' when none is pending)
#   $title    - title text found for the pending $url
#   $content  - text accumulated from content matches since the last markup match
#   $tfound   - true once a title has been found for the pending $url
#   $lcbpos   - match-end position remembered when a trailing '<' was pushed back
#   $last_content_pos, $begin_pos - written below but never read in this sub
my (
$url,
$title,
$content,
$tfound,
$lcbpos,
$last_content_pos,
$begin_pos
) = ('','','',0,0,0,0);
## parse loop: each iteration is one match of $Rxmarkup, either markup or content
while ($$markup =~ /$Rxmarkup/g)
{
## content match (C1 is only defined when the content alternation matched):
## buffer it and go straight to the next match
if (defined $+{C1}) {
## accumulate instead of processing each fragment separately
$content .= $+{C1};
if (length $+{C2})
{
# C2 is a '<' that ended the content run.
# First time at this position: remember where the match ended and move
# pos() back by one so the '<' is scanned again on the next iteration.
# If the next match ends at the same remembered position, the '<' is
# appended to the content buffer as literal text.
if ($lcbpos == pos($$markup)) {
$content .= $+{C2};
} else {
$lcbpos = pos($$markup);
pos($$markup) = $lcbpos - 1;
}
}
$last_content_pos = pos($$markup);
next;
}
## a markup match: first consume any content buffered before it
# NOTE: the matches against $content run inside this block; Perl restores
# the capture variables when the block exits, so the %+ tests further down
# still refer to the $Rxmarkup match of this iteration.
if (length $content)
{
$begin_pos = $last_content_pos;
## check '<'
if ($content =~ /</) {
## a literal '<' ended up in the content; currently only a debugging hook
#print "Markup '<' in content, da stuff is crap!\n";
}
# Look for the title only while an OPEN match is pending ($url non-empty).
# The captured text is trimmed; an empty capture is reported as 'no_title'.
if ($content =~ /($rxtitle)/x && length $url) {
$tfound = 1;
$title = $1;
$title =~ s/^\s*//;
$title =~ s/\s*$//;
$title = 'no_title' if !length($title);
}
$content = '';
}
## now handle the markup match itself
if (defined $+{OPEN}) {
# Record every attribute value under "tag-attr" and make it the pending url.
push @{$hHrefs{$+{OPEN}.'-'.$+{ATTR}}}, $+{VAL} ;
$url = $+{VAL};
$tfound = 0;
$title = '';
}
elsif (defined $+{CLOSE}) {
# A close tag completes the pending url; keep it only if a title was found.
if (length $url && $tfound) {
push @{$hTitles{$title}}, $url;
}
$url = '';
$tfound = 0;
$title = '';
}
} ## end parse loop
## leftover content after the last markup match is inspected but not used
if (length $content)
{
## check '<'
if ($content =~ /</) {
## a literal '<' in the trailing content; currently only a debugging hook
#print "Markup '<' in left over content, da stuff is crap!\n";
}
}
}
# InitName()
#
# Builds the pattern text for a tag/attribute name and stores it in the
# file-scoped $Name as a non-capturing group: one "start" character
# followed by any number of "name" characters. The pieces are kept as
# plain strings (the \x{...} escapes are not expanded here) so they can be
# interpolated into a regex later.
# NOTE(review): the ranges look like the XML 1.0 NameStartChar / NameChar
# productions - confirm against the spec before changing them.
sub InitName
{
    # Non-ASCII ranges accepted as the first character of a name.
    my @start_ranges = qw(
        \x{C0}-\x{D6}
        \x{D8}-\x{F6}
        \x{F8}-\x{2FF}
        \x{370}-\x{37D}
        \x{37F}-\x{1FFF}
        \x{200C}-\x{200D}
        \x{2070}-\x{218F}
        \x{2C00}-\x{2FEF}
        \x{3001}-\x{D7FF}
        \x{F900}-\x{FDCF}
        \x{FDF0}-\x{FFFD}
        \x{10000}-\x{EFFFF}
    );

    # Additional ranges accepted only after the first character.
    my @extra_ranges = qw(
        \x{B7}
        \x{0300}-\x{036F}
        \x{203F}-\x{2040}
    );

    my $start_set  = join '', @start_ranges;
    my $first_char = '[A-Za-z_:' . $start_set . ']';

    # The '-' goes last so it is a literal hyphen, not a range operator.
    my $later_char = '[\w:.' . join('', @extra_ranges) . $start_set . '-]';

    $Name = '(?:' . $first_char . $later_char . '*)';
}
# InitRegex()
#
# Compiles the tokenizer pattern into the file-scoped $Rxmarkup from the
# fragments $rxopen, $rxattr, $rxclose and $Name (all interpolated when
# this sub runs, so they must be set first).
#
# Each match of $Rxmarkup is exactly one of:
#   * a complete tag '<' ... '>'
#       - OPEN, ATTR, VAL are set for an open tag carrying the wanted attribute
#         (VAL keeps its surrounding quotes)
#       - CLOSE is set for the wanted close tag
#       - any other recognised markup sets no named capture
#   * a run of content: C1 is the text up to the next '<', C2 is that '<'
#     (or empty at end of input)
#
# Fix: the closing '>' is now part of every markup match. Without it the
# '>' was left behind and picked up as content, and the lazy '.*?' at the
# end of the DOCTYPE, comment, CDATA and processing-instruction branches
# had nothing to stop on.
#
# Keep the comments inside the pattern free of the regex delimiter and of
# sigils: they are still subject to delimiter scanning and interpolation.
sub InitRegex
{
    $Rxmarkup = qr/
        (?:
            <
            (?:
                # Specific markup
                (?: (?<OPEN> $rxopen ) \s+[^>]*? (?<=\s) (?<ATTR> $rxattr) \s*=\s* (?<VAL> ".+?"|'.+?')[^>]*? \s* \/?) # OPEN, ATTR, VAL
                |(?: (?<CLOSE> \/$rxclose ) \s* ) # CLOSE
                # Ordinary exclusionary markup
                |(?: \/* $Name \s* \/*)
                |(?: $Name (?:\s+(?:".*?"|'.*?'|[^>]*?)+) \s* \/?)
                |(?: \?.*?\?)
                |(?:
                    !
                    (?: # markup types that have '!'
                        (?: DOCTYPE.*?)
                        |(?: \[CDATA\[.*?\]\])
                        |(?: --.*?--)
                        |(?: \[[A-Z][A-Z\ ]*\[.*?\]\]) # who knows?
                        |(?: ATTLIST.*?)
                        |(?: ENTITY.*?)
                        |(?: ELEMENT.*?)
                        # add more if necessary
                    )
                )
            )
            >       # closing bracket shared by every markup form above
        )
        # This alternation handles content
        | (?<C1> [^<]*) (?<C2> <?) # C1, C2
    /xs;
}
__DATA__
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 # $ \ Transitional//EN">
<HTML><HEAD>
<META http-equiv=3DContent-Type content=3D"text/html; =
charset=3Diso-8859-1">
<META content=3D "MSHTML 6.00.2900.3395" name=3DGENERATOR>
<STYLE></STYLE>
<test name = " thi<s # $ \ is a " test>
</HEAD>
<BODY bgColor=3D#ffffff>
should fix these: # $ \
but not these: ¯
fix some here: &&%#$ &as; &&#a0
<a href="
http://helloA.com">Hello</a>
<IMG SRC = "foo.gif" ALT = "A > B">
<IMG SRC = "foo.gif"
ALT = "A > # $ \ B">
<!-- <A comment # $ \ > -->
<NN & a # $ \>
<AA & # $ \>
<# Just data #>
<![INCLUDE CDATA [ >>>>>\\ # $ \ >>>>>>> ]]>
<!-- This section commented out.
<B>You can't # $ \ see me!</B>
-->
<link rel="stylesheet" type="text/css" href="/includes/css/main.css">
at root # $ \ > # $ \ level
<a href="/info/twitter.aspx" target="_top">
<img src="/images/icons/icon_twitter.gif" border="0" align="absmiddle">
</a>
<html><body>
<p>Hello
Kitty</p>
<a
href
=
"helloB.com"
<!--
There is no Hello here
-->
</body></html>