R
robic0
Since so much was learned from the substitution method, I thought
this might be a better approach.
This is just the starting framework. The rest will be filled in.
Turn off the debug output for full speed.
If the regexp has been line-wrapped in transit, un-wrap it onto a single line before using.
# Startup banner: program name, version and copyright, written to STDOUT.
# Each entry is emitted literally, followed by a newline.
print map { "$_\n" } (
    '# -----------------------',
    '# XML (Regex) SAX Parser',
    '# Version .901 - 1/7/06',
    '# Copyright 2005,2006',
    '# by robic0-At-yahoo.com',
    '# -----------------------',
);
use strict;
use warnings;

# Read the whole input document into one string; the tokenizer loop scans it
# with a single global regex.
#
# A lexical handle with three-argument open is used instead of the bareword
# DATA handle: Perl reserves DATA for the text that follows __END__/__DATA__,
# and the explicit '<' mode keeps the file name from being parsed for a mode.
my $input_file = 'config.html';
open my $in_fh, '<', $input_file
    or die "can't open $input_file: $!";
my $gabage1 = join('', <$in_fh>);
close $in_fh;

# Parser state and switches used by the tokenizer loop.
my $cnt      = 1;     # incremented once per markup token; starts at 1
my $content  = '';    # text buffered between two markup tokens
my $show_pos = 1;     # true: print the match position before each token
my $debug    = 1;     # true: print a trace line for every token
# Tokenizer: walk the document with one global regex. Every match is exactly
# one of the alternatives below, identified by which capture group is defined:
#
#   $1      <? ... ?>
#   $2      <META ... >
#   $3      <!DOCTYPE ... >
#   $4      <![CDATA[ ... ]]>
#   $5      <!-- ... -->
#   $6      <tag>, </tag> or <tag/>                  (no attributes)
#   $7, $8  <tag attr="..."> or <tag attr="..."/>    (tag name, attribute text)
#   $9      any other single character               (text content)
#
# Keep the pattern on ONE physical line: it is compiled without /x, so a line
# break inserted into it would become part of the pattern.
#
# master
#/(?:<\?(.*?)\?>)|(?:<META(.*?)>)|(?:<!DOCTYPE(.*?)>)|(?:<!\[CDATA\[(.*?)\]\]>)|(?:<!--(.*?)-->)|(?:<(\/*[\:0-9a-zA-Z]+?[\s]*\/*)>)|(?:<([\:0-9a-zA-Z]+?)[\s]+((?:[\:0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/*)>)|(.+?)/sg)
while ($gabage1 =~
/(?:<\?(.*?)\?>)|(?:<META(.*?)>)|(?:<!DOCTYPE(.*?)>)|(?:<!\[CDATA\[(.*?)\]\]>)|(?:<!--(.*?)-->)|(?:<(\/*[\:0-9a-zA-Z]+?[\s]*\/*)>)|(?:<([\:0-9a-zA-Z]+?)[\s]+((?:[\:0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/*)>)|(.+?)/sg)
{
    # Text arrives one character per match (the last alternative is a lazy
    # .+? with nothing after it); buffer it until the next markup token.
    if (defined $9) {
        $content .= $9;
        next;
    }

    print "-" x 20, "\n" if $debug;

    # A markup token ends the current run of text: report it and reset.
    if (length $content) {
        print "9 $content\n" if $debug;
        $content = '';
    }

    # pos() is the offset just past the token that was matched.
    if ($show_pos) {
        my $end_offset = pos $gabage1;
        print "$end_offset ";
    }

    if ($debug) {
        print "1 VERSION: $1\n" if defined $1;
        print "2 META: $2\n"    if defined $2;
        print "3 DOCTYPE: $3\n" if defined $3;
        print "4 CDATA: $4\n"   if defined $4;
        # Comments are capture group 5; the label carries the group number.
        print "5 COMMENT: $5\n" if defined $5;
        ## <tag> or </tag> or <tag/>
        print "6 TAG: $6\n" if defined $6;
        ## <tag attrib/> or <tag attrib>
        print "7,8 TAG: $7 Attr: $8\n" if defined $7;
    }

    $cnt++;
}

# Text after the last markup token is still buffered when the loop ends,
# because only a following token flushes it; report it here.
if (length $content) {
    if ($debug) {
        print "-" x 20, "\n";
        print "9 $content\n";
    }
    $content = '';
}
__END__
this might be a better approach.
This is just the starting framework. The rest will be filled in.
Turn off the debug output for full speed.
If the regexp has been line-wrapped in transit, un-wrap it onto a single line before using.
# Startup banner: program name, version and copyright, written to STDOUT.
# Each entry is emitted literally, followed by a newline.
print map { "$_\n" } (
    '# -----------------------',
    '# XML (Regex) SAX Parser',
    '# Version .901 - 1/7/06',
    '# Copyright 2005,2006',
    '# by robic0-At-yahoo.com',
    '# -----------------------',
);
use strict;
use warnings;

# Read the whole input document into one string; the tokenizer loop scans it
# with a single global regex.
#
# A lexical handle with three-argument open is used instead of the bareword
# DATA handle: Perl reserves DATA for the text that follows __END__/__DATA__,
# and the explicit '<' mode keeps the file name from being parsed for a mode.
my $input_file = 'config.html';
open my $in_fh, '<', $input_file
    or die "can't open $input_file: $!";
my $gabage1 = join('', <$in_fh>);
close $in_fh;

# Parser state and switches used by the tokenizer loop.
my $cnt      = 1;     # incremented once per markup token; starts at 1
my $content  = '';    # text buffered between two markup tokens
my $show_pos = 1;     # true: print the match position before each token
my $debug    = 1;     # true: print a trace line for every token
# Tokenizer: walk the document with one global regex. Every match is exactly
# one of the alternatives below, identified by which capture group is defined:
#
#   $1      <? ... ?>
#   $2      <META ... >
#   $3      <!DOCTYPE ... >
#   $4      <![CDATA[ ... ]]>
#   $5      <!-- ... -->
#   $6      <tag>, </tag> or <tag/>                  (no attributes)
#   $7, $8  <tag attr="..."> or <tag attr="..."/>    (tag name, attribute text)
#   $9      any other single character               (text content)
#
# Keep the pattern on ONE physical line: it is compiled without /x, so a line
# break inserted into it would become part of the pattern.
#
# master
#/(?:<\?(.*?)\?>)|(?:<META(.*?)>)|(?:<!DOCTYPE(.*?)>)|(?:<!\[CDATA\[(.*?)\]\]>)|(?:<!--(.*?)-->)|(?:<(\/*[\:0-9a-zA-Z]+?[\s]*\/*)>)|(?:<([\:0-9a-zA-Z]+?)[\s]+((?:[\:0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/*)>)|(.+?)/sg)
while ($gabage1 =~
/(?:<\?(.*?)\?>)|(?:<META(.*?)>)|(?:<!DOCTYPE(.*?)>)|(?:<!\[CDATA\[(.*?)\]\]>)|(?:<!--(.*?)-->)|(?:<(\/*[\:0-9a-zA-Z]+?[\s]*\/*)>)|(?:<([\:0-9a-zA-Z]+?)[\s]+((?:[\:0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/*)>)|(.+?)/sg)
{
    # Text arrives one character per match (the last alternative is a lazy
    # .+? with nothing after it); buffer it until the next markup token.
    if (defined $9) {
        $content .= $9;
        next;
    }

    print "-" x 20, "\n" if $debug;

    # A markup token ends the current run of text: report it and reset.
    if (length $content) {
        print "9 $content\n" if $debug;
        $content = '';
    }

    # pos() is the offset just past the token that was matched.
    if ($show_pos) {
        my $end_offset = pos $gabage1;
        print "$end_offset ";
    }

    if ($debug) {
        print "1 VERSION: $1\n" if defined $1;
        print "2 META: $2\n"    if defined $2;
        print "3 DOCTYPE: $3\n" if defined $3;
        print "4 CDATA: $4\n"   if defined $4;
        # Comments are capture group 5; the label carries the group number.
        print "5 COMMENT: $5\n" if defined $5;
        ## <tag> or </tag> or <tag/>
        print "6 TAG: $6\n" if defined $6;
        ## <tag attrib/> or <tag attrib>
        print "7,8 TAG: $7 Attr: $8\n" if defined $7;
    }

    $cnt++;
}

# Text after the last markup token is still buffered when the loop ends,
# because only a following token flushes it; report it here.
if (length $content) {
    if ($debug) {
        print "-" x 20, "\n";
        print "9 $content\n";
    }
    $content = '';
}
__END__