My Regexp XML Parser -> Structured Perl Data, Cut & Paste Version, No Module's (Vol I)

R

robic0

On Tue, 20 Dec 2005 23:59:06 -0800, robic0 wrote:

Final edition .904
This is the last, it will not be continued. Further processing could
have been done for all tag entities and alot of specialized handling.
However due to the recent awareness of the latencies involved in regexp
(re)searches, the "inner" substitution method is not viable in this context.

Thats all I will say on this. Content is not examined for xml reserved
characters (ie: &amp, etc..), although its easily added. Kind of too bad,
this method has alot of potential.

Anyway, for now its just an exercise in xml/html form and structure.
This status could change in the future as I do a more detailed post-mortum.
Should it change significantly, you won't hear about it in the forums.
I don't anticipate that being the case, but who knows.

-robic0-

Changes:
- Added to regexp, white space to account for <tag />


print <<EOM;

# -----------------------
# XML Regex Parser
# Version .904 - 12/31/05
# Copyright 2005,
# by robic0-At-yahoo.com
# -----------------------
EOM

use strict;
use warnings;
use Data::Dumper;

open DATA, "your.html" or die "can't open your.html...";
my $gabage1 = join ('', <DATA>);
close DATA;


print "here\n";
my @xml_strings = ($gabage1);

my $alt_debug = 0;
my $VERSION = .904;
my $debug = 0;
my $rmv_white_space = 0;
my $ForceArray = 0;
my $KeepRoot = 0;
my $KeepComments = 0;
my $KeepContentOrder = 0;

## -- XML, start & end regexp substitution delimiter chars --
## match side , substitution side
## -------------------------/-------------------------------
my (@S_dlim, @E_dlim);
if ($debug) {
@S_dlim = ('\[' , '['); # use these for debug
@E_dlim = ('\]' , ']');
} else {
@S_dlim = (chr(140) , chr(140)); # use these for production
@E_dlim = (chr(141) , chr(141));
}

## -- Process xml data --
##
for (@xml_strings)
{
#print "\n",'*'x30,"\nXML string:\n",'-'x15,"\n$_\n\nOutput:\n",'-'x15,"\n\n" if ($debug);
if ($alt_debug) {
ProcessAltDebugInfo ($_) ;
print "\n";
}
my $ROOT = {}; # container
my %cdata_elements = ();
my ($last_cnt, $cnt, $i, $attr_error) = (-1, 1, 0, 0);

## Comment/CDATA block ==================================
#### To be done first -
# -- Questionable Comments --
while (s/(<!--(.*?)-->)/$S_dlim[1]$cnt$E_dlim[1]/) {
#print "$cnt = Questionable comment: $1\n" if ($debug);
$ROOT->{$cnt} = $1;
$cnt++;
}
#### To be done second -
# -- Real CDATA --
while (s/<!\[CDATA\[(.*?)\]\]>/$S_dlim[1]$cnt$E_dlim[1]/s)
{
# reconstitute cdata contents
my $cdata_contents = $1;
my $str = '';
while ($cdata_contents =~ s/([^$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//) {
if (defined $1) {
$str .= $1;
} elsif (defined $2 && exists $ROOT->{$2}) {
$str .= $ROOT->{$2};
delete $ROOT->{$2};
} else {} # shouldn't get here
}
print "$cnt CDATA = $str\n" if ($debug);
$ROOT->{$cnt} = $str;
$cdata_elements{$cnt} = '';
$cnt++;
}
#### To be done third -
# -- Real Comments are left --
foreach my $key (sort {$a <=> $b} keys %{$ROOT}) {
if (!exists $cdata_elements{$key}) {
$ROOT->{$key} =~ s/^<!--(.*?)-->$/$1/s;
print "$key Comment = $1\n" if ($debug);
if ($KeepComments) {
$ROOT->{$key} = { comment => $1 };
} else {delete $ROOT->{$key};}
}
}
## End Comment/CDATA block ==============================

#### Non-tag markups go here -
####

# -- Versioning -- <?XML-Version ?> - Placeholder, voided
while (s/<\?([^<>]*)\?>//) {
#while (s/<\?([^<>]*)\?>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <? ?> = $1\n" if ($debug);
$ROOT->{$cnt} = { 'XMLV' => $1 };
$cnt++;
}
# -- DOCTYPE -- <!DOCTYPE info> - Placeholder, voided
while (s/<!DOCTYPE([^<>]*)>//) {
#while (s/<!DOCTYPE([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt DOCTYPE = $1\n" if ($debug);
$ROOT->{$cnt} = { 'DOCTYPE' => $1 };
$cnt++;
}
# -- META -- <META info> - Placeholder, voided
while (s/<META([^<>]*)>//) {
#while (s/<META([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt META = $1\n" if ($debug);
$ROOT->{$cnt} = { 'META' => $1 };
$cnt++;
}
#### White space removal before tags ? .. TBD -
if ($rmv_white_space) {
s/>[\s]+</></g;
s/[\s]+</</g;
s/>[\s]+/>/g;
}

#### Tags here - should only need 2 iterations max
my $finished = 0;

while ($cnt != $last_cnt && $i < 20)
{
$last_cnt = $cnt;

## <Tag/> , no content
while (s/<([0-9a-zA-Z]+)[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = \n" if ($debug);
$ROOT->{$cnt} = { $1 => '' };
$cnt++;
}
## <Tag Attributes/> , no content
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = attr: $2\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
## <Tag> Content </Tag>
while (s/<([0-9a-zA-Z]+)>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = $2\n" if ($debug);
my $unknown = '';
if (length($2) > 0) {
my $hcontent = getContentHash($2, $ROOT, \%cdata_elements);
$unknown = $hcontent;
if (keys (%{$hcontent}) > 1) {
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
} else {
if (exists $hcontent->{'content'}) {
my ($key);
if (!$ForceArray ) {
if (ref($hcontent->{'content'}) eq "ARRAY" && scalar(@{$hcontent->{'content'}}) == 1) {
$unknown = ${$hcontent->{'content'}}[0];
}
else {$unknown = $hcontent->{'content'}; }
}
}
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
}
}
$ROOT->{$cnt} = { $1 => $unknown };
$cnt++;
}
last if ($attr_error);
## <Tag Attributes> Content </Tag>
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"][\s]*)+[\s]*>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <$1> = attr: $2, content: $3\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
if (length($3) > 0) {
my $hcontent = getContentHash($3, $ROOT, \%cdata_elements);
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
while (my ($key,$val) = each (%{$hcontent})) {
$hattrib->{$key} = $val;
}
}
$ROOT->{$cnt} = { $1 => $hattrib };
$cnt++;
}
last if ($attr_error);
if ($last_cnt != $cnt) {
$i++; print "** End pass $i\n" if ($debug);
} else {
last if ($finished);
## Encapsulate the xml with a "root"
$_ = "<root>$_</root>";
$last_cnt--;
$finished = 1;
}
}
next if ($attr_error);

if (/<|>/) {
print "($i) XML problem: malformed, syntax or tag closure:\n$_";
} else {
print "** Itterations = $i\n".
"** Debug = $debug\n".
"** Rmv white space = $rmv_white_space\n".
"** ForceArray = $ForceArray\n".
"** KeepRoot = $KeepRoot\n".
"** KeepComments = $KeepComments\n".
"** KeepContentOrder = $KeepContentOrder\n";
#print Dumper($ROOT);
print "The remaining string is:\n$_\n\n" if ($debug);

## Strip off the outer element (our root) to
## examine the contents for errors.
## ---------------------------------------
my $outer_element = $cnt-1;
if (exists $ROOT->{$outer_element})
{
my $hroot = $ROOT->{$outer_element};
my ($key,$val) = each (%{$hroot});
my $htodump = $val;

# check for errors in root
if (ref($htodump) ne "HASH" || (!$KeepContentOrder && exists $htodump->{'content'})) {
my $msg = 'Error';
$msg = 'Warning' if ($KeepContentOrder);
print "$msg, bare content at root level ..\n";
} else {
my $dmp_keys = keys (%{$htodump});

if ($dmp_keys > 1) {
print "Warning, multiple elements at root level ..\n";
} else {
($key,$val) = each (%{$htodump});
my $val_type = ref($val);

if ($dmp_keys == 0 || (exists $htodump->{'comment'})) {
print "Warning, no elements at root level ..\n";
}
if ($dmp_keys == 1) {
if ($val_type eq "HASH") {
$htodump = $val if (!$KeepRoot);
}
elsif ($val_type eq "ARRAY") {
$htodump = $val if (!$KeepRoot && $KeepContentOrder);
if (!$ForceArray || scalar(@{$val}) > 1) {
print "Warning, multiple elements at root level ..\n";
}
}
}
}
}
print "\n";
my $tmp = undef;
if (ref($htodump) eq "HASH") {
$tmp = {};
%{$tmp} = %{$htodump};
} elsif (ref($htodump) eq "ARRAY") {
$tmp = [];
@{$tmp} = @{$htodump};
} else {
print "Not a hash or array!\n";
}
print Dumper($tmp) if (defined $tmp);
} else {
print "nothing to output!\n";
}
}
}
##
sub adjustForSingleItemArrays
{
my $href = shift;
## if $val is an array ref and has one element
## set $href->{$key} equal to the element
while (my ($key,$val) = each (%{$href})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1) {
$href->{$key} = $val->[0];
}
}
}
}
##
sub getAttrHash
{
my $attrstr = shift;
my $ahref = {};
return $ahref unless (defined $attrstr);
while ($attrstr =~ s/[\s]*([0-9a-zA-Z]+)[\s]*=[\s]*("|')([^=]*)\2[\s]*//) {
$ahref->{$1} = $3;
}
if ($attrstr=~/=/) {
$attrstr =~ s/^\s+//s;
$attrstr =~ s/\s+$//s;
return $attrstr
}
return $ahref;
}
##
sub getContentHash
{
my ($contstr,$hStore,$hcdata_elements) = @_;
my $ahref = {};
return $ahref unless (defined $contstr && defined $hStore && defined $hcdata_elements);
my @ary = ();
my $append_flag = 0;

while ($contstr =~ s/^([^<$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//s)
{
## -- $1 is text contents --
if (defined $1) {
my $tmp1 = $1;
# if flagged, append it to $ary[last]
if ($append_flag && scalar(@ary) > 0) {
my $size = scalar(@ary);
$ary[$size-1] .= $tmp1;
} else {
push (@ary, $1);
}
$append_flag = 0;
}
## -- $2 is substitution index --
elsif (defined $2) {
## Exist check (Comments stripped?),
# turn on append flag.
# -----------------------------------
if (!exists $hStore->{$2}) {
$append_flag = 1;
next;
}
## CDATA check, append it to $ary[last]
# and turn on append flag.
# ---------------------------------------
if (exists $hcdata_elements->{$2}) {
my $size = scalar(@ary);
if ($size > 0) {
$ary[$size-1] .= $hStore->{$2};
} else {push (@ary, $hStore->{$2});}
$append_flag = 1;
next;
}
$append_flag = 0;

## Substitution of in-line content,
# push it to @ary
# ----------------------------------
if ($KeepContentOrder) {
push (@ary, $hStore->{$2});
next;
}
## Substitution of same level here (normal),
# just store it to $ahref
# -----------------------------------------
my ($key,$val) = each (%{$hStore->{$2}});
if (exists $ahref->{$key}) {
push (@{$ahref->{$key}}, $val);
} else {
$ahref->{$key} = [$val];
}
}
else {} # shouldn't get here
}
# Store contents, strip out
# pure whitespace text elements
my $hary = [];
for (@ary) {
next if (/^\s+$/s);
push (@{$hary}, $_);
}
if (scalar(@{$hary}) > 0) {
$ahref->{'content'} = $hary;
}
## if $val is an array ref and has one element and it
## is a hash ref, set {$key} equal to hash ref
if (!$ForceArray) {
while (my ($key,$val) = each (%{$ahref})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1 && ref($val->[0]) eq "HASH") {
$ahref->{$key} = $val->[0];
}
}
}
}
return $ahref;
}

sub ProcessAltDebugInfo
{
}
 
M

Matt Garrish

On Tue, 20 Dec 2005 23:59:06 -0800, robic0 wrote:

Final edition .904
This is the last, it will not be continued. Further processing could
have been done for all tag entities and alot of specialized handling.
However due to the recent awareness of the latencies involved in regexp
(re)searches, the "inner" substitution method is not viable in this
context.

Thats all I will say on this. Content is not examined for xml reserved
characters (ie: &amp, etc..), although its easily added. Kind of too bad,
this method has alot of potential.

Well, at least you had the good sense to abandon the sinking ship...

Matt
 
R

robic0

On Tue, 20 Dec 2005 23:59:06 -0800, robic0 wrote:

Post-Mortum conclusions:

I have cut out 3 test examples on the post-mortum.
1. Substitution data store is a anonymous hash.
2. Data store is bypassed, just null regexp substitution is done.
3. Substitution data store is a named array.

I'm posting #3 here for reference and the final code v .905
A note of interest between #1 and #3 on a test of 15,000
substitutions (large html file), there was no discernable time
difference using hash lookups as opposed to array indexes.
This shows that hash lookups involve just a pointer + offset
math as is done with array index addressing.

The time killer here was #2, not #1 or #3.
This makes sense when doing inner substitution since the search
position has to be reset at every pass.

This code might be a usefull to debug the inner data structure
of xml/html beyond just validation (by itself). Since at the
time of failure or possibly no failure but bad structure, the inner
structures grown up until then are retained for post-mortum.
This would be in contrast to an outter to inner parse.
In that case, since the data structure is build from the outside in,
there wouldn't be much of a starting point for analysis.

Other than that and from what was mentioned about the #2 time killer,
about the only use given the limitations here would be as a debug
tool. The code is readily alterred to filter needed structures.

Let me know if you have any questions.
-robic0-

print <<EOM;

# -----------------------
# XML Regex Parser
# Version .905 - 12/31/05
# Copyright 2005,
# by robic0-At-yahoo.com
# -----------------------
EOM

use strict;
use warnings;
use Data::Dumper;

open DATA, "config.html" or die "can't open config.html...";
my $gabage1 = join ('', <DATA>);
close DATA;


my @xml_strings = ($gabage1);

my $alt_debug = 0;
my $VERSION = .905;
my $debug = 0;
my $rmv_white_space = 0;
my $ForceArray = 0;
my $KeepRoot = 0;
my $KeepComments = 1;
my $KeepContentOrder = 1;

## -- XML, start & end regexp substitution delimiter chars --
## match side , substitution side
## -------------------------/-------------------------------
my (@S_dlim, @E_dlim);
if ($debug) {
@S_dlim = ('\[' , '['); # use these for debug
@E_dlim = ('\]' , ']');
} else {
@S_dlim = (chr(140) , chr(140)); # use these for production
@E_dlim = (chr(141) , chr(141));
}

## -- Process xml data --
##
for (@xml_strings)
{
#print "\n",'*'x30,"\nXML string:\n",'-'x15,"\n$_\n\nOutput:\n",'-'x15,"\n\n" if ($debug);
if ($alt_debug) {
ProcessAltDebugInfo ($_) ;
print "\n";
}
my (@ROOT, %cdata_elements);
@ROOT = (); # container
%cdata_elements = ();

my ($last_cnt, $cnt, $i, $attr_error) = (-1, 1, 0, 0);

## Comment/CDATA block ==================================
#### To be done first -
# -- Questionable Comments --
while (s/(<!--(.*?)-->)/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt = Questionable comment: $1\n" if ($debug);
$ROOT[$cnt] = $1;
$cnt++;
}
#### To be done second -
# -- Real CDATA --
while (s/<!\[CDATA\[(.*?)\]\]>/$S_dlim[1]$cnt$E_dlim[1]/s)
{
# reconstitute cdata contents
my $cdata_contents = $1;
my $str = '';
while ($cdata_contents =~ s/([^$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//) {
if (defined $1) {
$str .= $1;
} elsif (defined $2 && defined $ROOT[$2]) {
$str .= $ROOT[$2];
delete $ROOT[$2];
} else {} # shouldn't get here
}
print "$cnt CDATA = $str\n" if ($debug);
$ROOT[$cnt] = $str;
$cdata_elements{$cnt} = '';
$cnt++;
}
#### To be done third -
# -- Real Comments are left --
for (my $ndx = 1; $ndx < @ROOT; $ndx++) {
if (!exists $cdata_elements{$ndx}) {
$ROOT[$ndx] =~ s/^<!--(.*?)-->$/$1/s;
print "$ndx Comment = $1\n" if ($debug);
if ($KeepComments) {
$ROOT[$ndx] = { comment => $1 };
} else {delete $ROOT[$ndx];}
}
}
## End Comment/CDATA block ==============================

#### Non-tag markups go here -
####

# -- Versioning -- <?XML-Version ?> - Placeholder, voided
while (s/<\?([^<>]*)\?>//) {
#while (s/<\?([^<>]*)\?>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <? ?> = $1\n" if ($debug);
$ROOT[$cnt] = { 'XMLV' => $1 };
$cnt++;
}
# -- DOCTYPE -- <!DOCTYPE info> - Placeholder, voided
while (s/<!DOCTYPE([^<>]*)>//) {
#while (s/<!DOCTYPE([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt DOCTYPE = $1\n" if ($debug);
$ROOT[$cnt] = { 'DOCTYPE' => $1 };
$cnt++;
}
# -- META -- <META info> - Placeholder, voided
while (s/<META([^<>]*)>//) {
#while (s/<META([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt META = $1\n" if ($debug);
$ROOT[$cnt] = { 'META' => $1 };
$cnt++;
}
#### White space removal before tags ? .. TBD -
if ($rmv_white_space) {
s/>[\s]+</></g;
s/[\s]+</</g;
s/>[\s]+/>/g;
}
#### Tags here - should only need 2 iterations max
my $finished = 0;

while ($cnt != $last_cnt && $i < 20)
{
$last_cnt = $cnt;

## <Tag/> , no content
while (s/<([0-9a-zA-Z]+)[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = \n" if ($debug);
$ROOT[$cnt] = { $1 => '' };
$cnt++;
}
## <Tag Attributes/> , no content
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = attr: $2\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
$ROOT[$cnt] = { $1 => $hattrib };
$cnt++;
}
## <Tag> Content </Tag>
while (s/<([0-9a-zA-Z]+)>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = $2\n" if ($debug);
my $unknown = '';
if (length($2) > 0) {
my $hcontent = getContentHash($2, \@ROOT, \%cdata_elements);
$unknown = $hcontent;
if (keys (%{$hcontent}) > 1) {
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
} else {
if (exists $hcontent->{'content'}) {
my ($key);
if (!$ForceArray ) {
if (ref($hcontent->{'content'}) eq "ARRAY" && scalar(@{$hcontent->{'content'}}) == 1) {
$unknown = ${$hcontent->{'content'}}[0];
}
else {$unknown = $hcontent->{'content'}; }
}
}
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
}
}
$ROOT[$cnt] = { $1 => $unknown };
$cnt++;
}
last if ($attr_error);
## <Tag Attributes> Content </Tag>
while (s/<([0-9a-zA-Z]+)([\s]+[0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"][\s]*)+[\s]*>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = attr: $2, content: $3\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
if (length($3) > 0) {
my $hcontent = getContentHash($3, \@ROOT, \%cdata_elements);
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
while (my ($key,$val) = each (%{$hcontent})) {
$hattrib->{$key} = $val;
}
}
$ROOT[$cnt] = { $1 => $hattrib };
$cnt++;
}
last if ($attr_error);
if ($last_cnt != $cnt) {
$i++; print "** End pass $i\n" if ($debug);
} else {
last if ($finished);
## Encapsulate the xml with a "root"
$_ = "<root>$_</root>";
$last_cnt--;
$finished = 1;
}
}
next if ($attr_error);

if (/<|>/) {
print "($i) XML problem: malformed, syntax or tag closure:\n$_";
} else {
print "** Itterations = $i\n".
"** Debug = $debug\n".
"** Rmv white space = $rmv_white_space\n".
"** ForceArray = $ForceArray\n".
"** KeepRoot = $KeepRoot\n".
"** KeepComments = $KeepComments\n".
"** KeepContentOrder = $KeepContentOrder\n";
#print Dumper($ROOT);
print "The remaining string is:\n$_\n\n" if ($debug);

## Strip off the outer element (our root) to
## examine the contents for errors.
## ---------------------------------------
my $outer_element = $cnt-1;
if (defined $ROOT[$outer_element])
{
my $hroot = $ROOT[$outer_element];
my ($key,$val) = each (%{$hroot});
my $htodump = $val;

# check for errors in root
if (ref($htodump) ne "HASH" || (!$KeepContentOrder && exists $htodump->{'content'})) {
my $msg = 'Error';
$msg = 'Warning' if ($KeepContentOrder);
print "$msg, bare content at root level ..\n";
} else {
my $dmp_keys = keys (%{$htodump});

if ($dmp_keys > 1) {
print "Warning, multiple elements at root level ..\n";
} else {
($key,$val) = each (%{$htodump});
my $val_type = ref($val);

if ($dmp_keys == 0 || (exists $htodump->{'comment'})) {
print "Warning, no elements at root level ..\n";
}
if ($dmp_keys == 1) {
if ($val_type eq "HASH") {
$htodump = $val if (!$KeepRoot);
}
elsif ($val_type eq "ARRAY") {
$htodump = $val if (!$KeepRoot && $KeepContentOrder);
if (!$ForceArray || scalar(@{$val}) > 1) {
print "Warning, multiple elements at root level ..\n";
}
}
}
}
}
print "\n";
my $tmp = undef;
if (ref($htodump) eq "HASH") {
$tmp = {};
%{$tmp} = %{$htodump};
} elsif (ref($htodump) eq "ARRAY") {
$tmp = [];
@{$tmp} = @{$htodump};
} else {
print "Not a hash or array!\n";
}
print Dumper($tmp) if (defined $tmp);
} else {
print "nothing to output!\n";
}
}
}
##
sub adjustForSingleItemArrays
{
my $href = shift;
## if $val is an array ref and has one element
## set $href->{$key} equal to the element
while (my ($key,$val) = each (%{$href})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1) {
$href->{$key} = $val->[0];
}
}
}
}
##
sub getAttrHash
{
my $attrstr = shift;
my $ahref = {};
return $ahref unless (defined $attrstr);
while ($attrstr =~ s/[\s]*([0-9a-zA-Z]+)[\s]*=[\s]*("|')([^=]*)\2[\s]*//s) {
$ahref->{$1} = $3;
}
if ($attrstr=~/=/) {
$attrstr =~ s/^\s+//s;
$attrstr =~ s/\s+$//s;
return $attrstr
}
return $ahref;
}
##
sub getContentHash
{
my ($contstr,$hStore,$hcdata_elements) = @_;
my $ahref = {};
return $ahref unless (defined $contstr && defined $hStore && defined $hcdata_elements);
my @ary = ();
my $append_flag = 0;

while ($contstr =~ s/^([^<$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//s)
{
## -- $1 is text contents --
if (defined $1) {
my $tmp1 = $1;
# if flagged, append it to $ary[last]
if ($append_flag && scalar(@ary) > 0) {
my $size = scalar(@ary);
$ary[$size-1] .= $tmp1;
} else {
push (@ary, $1);
}
$append_flag = 0;
}
## -- $2 is substitution index --
elsif (defined $2) {
## Exist check (Comments stripped?),
# turn on append flag.
# -----------------------------------
if (!defined $hStore->[$2]) {
$append_flag = 1;
next;
}
## CDATA check, append it to $ary[last]
# and turn on append flag.
# ---------------------------------------
if (exists $hcdata_elements->{$2}) {
my $size = scalar(@ary);
if ($size > 0) {
$ary[$size-1] .= $hStore->[$2];
} else {push (@ary, $hStore->[$2]);}
$append_flag = 1;
next;
}
$append_flag = 0;

## Substitution of in-line content,
# push it to @ary
# ----------------------------------
if ($KeepContentOrder) {
push (@ary, $hStore->[$2]);
next;
}
## Substitution of same level here (normal),
# just store it to $ahref
# -----------------------------------------
my ($key,$val) = each (%{$hStore->[$2]});
if (exists $ahref->{$key}) {
push (@{$ahref->{$key}}, $val);
} else {
$ahref->{$key} = [$val];
}
}
else {} # shouldn't get here
}
# Store contents, strip out
# pure whitespace text elements
my $hary = [];
for (@ary) {
next if (/^\s+$/s);
push (@{$hary}, $_);
}
if (scalar(@{$hary}) > 0) {
$ahref->{'content'} = $hary;
}
## if $val is an array ref and has one element and it
## is a hash ref, set {$key} equal to hash ref
if (!$ForceArray) {
while (my ($key,$val) = each (%{$ahref})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1 && ref($val->[0]) eq "HASH") {
$ahref->{$key} = $val->[0];
}
}
}
}
return $ahref;
}

sub ProcessAltDebugInfo
{
}

__END__
 
R

robic0

On Tue, 20 Dec 2005 23:59:06 -0800, robic0 wrote:

I'm posting v906 with ':' in the regex so "xsd" can
and should be interpreted.
I did this before but I can't find it anywhere.
It was tested at one time with that code but can't gurantee
it now cause I just pushed that char in the regex and nobody
gives a shit. Btw, if you think this is mental masturbation,
this is nothing compared to this forum, the "cream" of the crop!
Should work, if it doesent, don't blama me. hahaha
As if you give a rats ass..

Here it is. I may post some grease, regex code with a new
substitution paradigm. So far, it looks as though you could
care less if I live or die. Maybe I'll chunk it down to
10 assembly lines. Hahaha. There u go .... to the bottom of
my list.

print <<EOM;

# -----------------------
# XML Regex Parser
# Version .906 - 1/5/06
# Copyright 2005,2006
# by robic0-At-yahoo.com
# -----------------------
EOM

use strict;
use warnings;
use Data::Dumper;

open DATA, "config.html" or die "can't open config.html...";
my $gabage1 = join ('', <DATA>);
close DATA;


my @xml_strings = ($gabage1);

my $alt_debug = 0;
my $VERSION = .906;
my $debug = 0;
my $rmv_white_space = 0;
my $ForceArray = 0;
my $KeepRoot = 0;
my $KeepComments = 1;
my $KeepContentOrder = 1;

## -- XML, start & end regexp substitution delimiter chars --
## match side , substitution side
## -------------------------/-------------------------------
my (@S_dlim, @E_dlim);
if ($debug) {
@S_dlim = ('\[' , '['); # use these for debug
@E_dlim = ('\]' , ']');
} else {
@S_dlim = (chr(140) , chr(140)); # use these for production
@E_dlim = (chr(141) , chr(141));
}

## -- Process xml data --
##
for (@xml_strings)
{
#print "\n",'*'x30,"\nXML string:\n",'-'x15,"\n$_\n\nOutput:\n",'-'x15,"\n\n" if ($debug);
if ($alt_debug) {
ProcessAltDebugInfo ($_) ;
print "\n";
}
my (@ROOT, %cdata_elements);
@ROOT = (); # container
%cdata_elements = ();

my ($last_cnt, $cnt, $i, $attr_error) = (-1, 1, 0, 0);

## Comment/CDATA block ==================================
#### To be done first -
# -- Questionable Comments --
while (s/(<!--(.*?)-->)/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt = Questionable comment: $1\n" if ($debug);
$ROOT[$cnt] = $1;
$cnt++;
}
#### To be done second -
# -- Real CDATA --
while (s/<!\[CDATA\[(.*?)\]\]>/$S_dlim[1]$cnt$E_dlim[1]/s)
{
# reconstitute cdata contents
my $cdata_contents = $1;
my $str = '';
while ($cdata_contents =~ s/([^$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//) {
if (defined $1) {
$str .= $1;
} elsif (defined $2 && defined $ROOT[$2]) {
$str .= $ROOT[$2];
delete $ROOT[$2];
} else {} # shouldn't get here
}
print "$cnt CDATA = $str\n" if ($debug);
$ROOT[$cnt] = $str;
$cdata_elements{$cnt} = '';
$cnt++;
}
#### To be done third -
# -- Real Comments are left --
for (my $ndx = 1; $ndx < @ROOT; $ndx++) {
if (!exists $cdata_elements{$ndx}) {
$ROOT[$ndx] =~ s/^<!--(.*?)-->$/$1/s;
print "$ndx Comment = $1\n" if ($debug);
if ($KeepComments) {
$ROOT[$ndx] = { comment => $1 };
} else {delete $ROOT[$ndx];}
}
}
## End Comment/CDATA block ==============================

#### Non-tag markups go here -
####

# -- Versioning -- <?XML-Version ?> - Placeholder, voided
while (s/<\?([^<>]*)\?>//) {
#while (s/<\?([^<>]*)\?>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt <? ?> = $1\n" if ($debug);
$ROOT[$cnt] = { 'XMLV' => $1 };
$cnt++;
}
# -- DOCTYPE -- <!DOCTYPE info> - Placeholder, voided
while (s/<!DOCTYPE([^<>]*)>//) {
#while (s/<!DOCTYPE([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt DOCTYPE = $1\n" if ($debug);
$ROOT[$cnt] = { 'DOCTYPE' => $1 };
$cnt++;
}
# -- META -- <META info> - Placeholder, voided
while (s/<META([^<>]*)>//) {
#while (s/<META([^<>]*)>/$S_dlim[1]$cnt$E_dlim[1]/) {
print "$cnt META = $1\n" if ($debug);
$ROOT[$cnt] = { 'META' => $1 };
$cnt++;
}
#### White space removal before tags ? .. TBD -
if ($rmv_white_space) {
s/>[\s]+</></g;
s/[\s]+</</g;
s/>[\s]+/>/g;
}
#### Tags here - should only need 2 iterations max
my $finished = 0;

while ($cnt != $last_cnt && $i < 20)
{
$last_cnt = $cnt;

## <Tag/> , no content
while (s/<([:0-9a-zA-Z]+)[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = \n" if ($debug);
$ROOT[$cnt] = { $1 => '' };
$cnt++;
}
## <Tag Attributes/> , no content
while (s/<([:0-9a-zA-Z]+)([\s]+[:0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"])+[\s]*\/>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = attr: $2\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
$ROOT[$cnt] = { $1 => $hattrib };
$cnt++;
}
## <Tag> Content </Tag>
while (s/<([:0-9a-zA-Z]+)>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = $2\n" if ($debug);
my $unknown = '';
if (length($2) > 0) {
my $hcontent = getContentHash($2, \@ROOT, \%cdata_elements);
$unknown = $hcontent;
if (keys (%{$hcontent}) > 1) {
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
} else {
if (exists $hcontent->{'content'}) {
my ($key);
if (!$ForceArray ) {
if (ref($hcontent->{'content'}) eq "ARRAY" && scalar(@{$hcontent->{'content'}}) == 1) {
$unknown = ${$hcontent->{'content'}}[0];
}
else {$unknown = $hcontent->{'content'}; }
}
}
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
}
}
$ROOT[$cnt] = { $1 => $unknown };
$cnt++;
}
last if ($attr_error);
## <Tag Attributes> Content </Tag>
while (s/<([:0-9a-zA-Z]+)([\s]+[:0-9a-zA-Z]+[\s]*=[\s]*["'][^<]*['"][\s]*)+[\s]*>([^<]*)<[\s]*\/\1>/$S_dlim[1]$cnt$E_dlim[1]/s) {
print "$cnt <$1> = attr: $2, content: $3\n" if ($debug);
my $hattrib = getAttrHash($2);
if (ref($hattrib) ne "HASH") {
print "Invalid token in attribute asignment:\n$hattrib\n"; $attr_error = 1; last;
}
if (length($3) > 0) {
my $hcontent = getContentHash($3, \@ROOT, \%cdata_elements);
if (!$ForceArray) { adjustForSingleItemArrays ($hcontent); }
while (my ($key,$val) = each (%{$hcontent})) {
$hattrib->{$key} = $val;
}
}
$ROOT[$cnt] = { $1 => $hattrib };
$cnt++;
}
last if ($attr_error);
if ($last_cnt != $cnt) {
$i++; print "** End pass $i\n" if ($debug);
} else {
last if ($finished);
## Encapsulate the xml with a "root"
$_ = "<root>$_</root>";
$last_cnt--;
$finished = 1;
}
}
next if ($attr_error);

if (/<|>/) {
print "($i) XML problem: malformed, syntax or tag closure:\n$_";
} else {
print "** Itterations = $i\n".
"** Debug = $debug\n".
"** Rmv white space = $rmv_white_space\n".
"** ForceArray = $ForceArray\n".
"** KeepRoot = $KeepRoot\n".
"** KeepComments = $KeepComments\n".
"** KeepContentOrder = $KeepContentOrder\n";
#print Dumper($ROOT);
print "The remaining string is:\n$_\n\n" if ($debug);

## Strip off the outer element (our root) to
## examine the contents for errors.
## ---------------------------------------
my $outer_element = $cnt-1;
if (defined $ROOT[$outer_element])
{
my $hroot = $ROOT[$outer_element];
my ($key,$val) = each (%{$hroot});
my $htodump = $val;

# check for errors in root
if (ref($htodump) ne "HASH" || (!$KeepContentOrder && exists $htodump->{'content'})) {
my $msg = 'Error';
$msg = 'Warning' if ($KeepContentOrder);
print "$msg, bare content at root level ..\n";
} else {
my $dmp_keys = keys (%{$htodump});

if ($dmp_keys > 1) {
print "Warning, multiple elements at root level ..\n";
} else {
($key,$val) = each (%{$htodump});
my $val_type = ref($val);

if ($dmp_keys == 0 || (exists $htodump->{'comment'})) {
print "Warning, no elements at root level ..\n";
}
if ($dmp_keys == 1) {
if ($val_type eq "HASH") {
$htodump = $val if (!$KeepRoot);
}
elsif ($val_type eq "ARRAY") {
$htodump = $val if (!$KeepRoot && $KeepContentOrder);
if (!$ForceArray || scalar(@{$val}) > 1) {
print "Warning, multiple elements at root level ..\n";
}
}
}
}
}
print "\n";
my $tmp = undef;
if (ref($htodump) eq "HASH") {
$tmp = {};
%{$tmp} = %{$htodump};
} elsif (ref($htodump) eq "ARRAY") {
$tmp = [];
@{$tmp} = @{$htodump};
} else {
print "Not a hash or array!\n";
}
print Dumper($tmp) if (defined $tmp);
} else {
print "nothing to output!\n";
}
}
}
##
sub adjustForSingleItemArrays
{
my $href = shift;
## if $val is an array ref and has one element
## set $href->{$key} equal to the element
while (my ($key,$val) = each (%{$href})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1) {
$href->{$key} = $val->[0];
}
}
}
}
##
sub getAttrHash
{
my $attrstr = shift;
my $ahref = {};
return $ahref unless (defined $attrstr);
while ($attrstr =~ s/[\s]*([:0-9a-zA-Z]+)[\s]*=[\s]*("|')([^=]*)\2[\s]*//s) {
$ahref->{$1} = $3;
}
if ($attrstr=~/=/) {
$attrstr =~ s/^\s+//s;
$attrstr =~ s/\s+$//s;
return $attrstr
}
return $ahref;
}
##
sub getContentHash
{
my ($contstr,$hStore,$hcdata_elements) = @_;
my $ahref = {};
return $ahref unless (defined $contstr && defined $hStore && defined $hcdata_elements);
my @ary = ();
my $append_flag = 0;

while ($contstr =~ s/^([^<$S_dlim[0]$E_dlim[0]]+)|$S_dlim[0]([\d]+)$E_dlim[0]//s)
{
## -- $1 is text contents --
if (defined $1) {
my $tmp1 = $1;
# if flagged, append it to $ary[last]
if ($append_flag && scalar(@ary) > 0) {
my $size = scalar(@ary);
$ary[$size-1] .= $tmp1;
} else {
push (@ary, $1);
}
$append_flag = 0;
}
## -- $2 is substitution index --
elsif (defined $2) {
## Exist check (Comments stripped?),
# turn on append flag.
# -----------------------------------
if (!defined $hStore->[$2]) {
$append_flag = 1;
next;
}
## CDATA check, append it to $ary[last]
# and turn on append flag.
# ---------------------------------------
if (exists $hcdata_elements->{$2}) {
my $size = scalar(@ary);
if ($size > 0) {
$ary[$size-1] .= $hStore->[$2];
} else {push (@ary, $hStore->[$2]);}
$append_flag = 1;
next;
}
$append_flag = 0;

## Substitution of in-line content,
# push it to @ary
# ----------------------------------
if ($KeepContentOrder) {
push (@ary, $hStore->[$2]);
next;
}
## Substitution of same level here (normal),
# just store it to $ahref
# -----------------------------------------
my ($key,$val) = each (%{$hStore->[$2]});
if (exists $ahref->{$key}) {
push (@{$ahref->{$key}}, $val);
} else {
$ahref->{$key} = [$val];
}
}
else {} # shouldn't get here
}
# Store contents, strip out
# pure whitespace text elements
my $hary = [];
for (@ary) {
next if (/^\s+$/s);
push (@{$hary}, $_);
}
if (scalar(@{$hary}) > 0) {
$ahref->{'content'} = $hary;
}
## if $val is an array ref and has one element and it
## is a hash ref, set {$key} equal to hash ref
if (!$ForceArray) {
while (my ($key,$val) = each (%{$ahref})) {
if (ref($val) eq "ARRAY") {
if (scalar(@{$val}) == 1 && ref($val->[0]) eq "HASH") {
$ahref->{$key} = $val->[0];
}
}
}
}
return $ahref;
}

sub ProcessAltDebugInfo
{
}

__END__
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,744
Messages
2,569,482
Members
44,901
Latest member
Noble71S45

Latest Threads

Top