Peter said:
Perhaps it's not just your regexes... as Andrew said, "You should post
the actual code ..." (Preferably both the Perl and the Python, if you
are trying to make this into a them vs. us kind of thing, as you
seem to be doing.)
Well, commenting out the regex substitutions in both versions leads to
equal execution times, as I mentioned earlier, so it has to be my regexes.
Anyway, for your pleasure <wink>, the code (it takes a list of newsgroup
postings and calculates a ratio followups/postings):
------------------------------------------------------------------
First, the Perl version (not coded by me):
------------------------------------------------------------------
#!/usr/bin/perl -w
use strict;
# Usage: script TOP_N < postings
#
# Reads '|'-separated records (ignored|from|message-id|reference) from
# STDIN, groups message-ids by normalized poster name, counts how often each
# message-id appears as a reference, and prints the TOP_N posters with the
# most postings, ranked by follow-ups per posting.
die "usage: $0 TOP_N < postings\n" unless @ARGV;

my (%post, %fup);    # poster => [message-ids]; reference => occurrence count

while (<STDIN>) {
    chomp;
    my (undef, $from, $mid, $ref) = split /\|/;

    # split drops trailing empty fields, so a short or malformed line leaves
    # these undefined; guard them instead of warning on every use below.
    next unless defined $mid;
    $ref = '' unless defined $ref;

    # Normalize the poster: drop the <address> part, prefer a "(Real Name)"
    # comment if present, and strip surrounding double quotes.
    $from =~ s|\s*<.*>\s*||;
    $from = $1 if $from =~ m|\((.*)\)|;
    $from =~ s|^"(.*)"$|$1|;

    push @{ $post{$from} ||= [] }, $mid;
    $fup{$ref}++;
}

# One row per poster: [ratio, postings, follow-ups, name].
my @arr;
for my $poster (keys %post) {
    my $mids = $post{$poster};
    my $m    = 0;
    $m += $fup{$_} || 0 for @$mids;
    my $n = @$mids;
    push @arr, [ $m / $n, $n, $m, $poster ];
}

# Keep only the TOP_N posters by number of postings.  Assigning to $#arr
# unconditionally would *extend* the array with undef rows whenever TOP_N
# exceeds the number of posters, so truncate only when there is a surplus.
@arr = sort { $b->[1] <=> $a->[1] } @arr;
$#arr = $ARGV[0] - 1 if $ARGV[0] < @arr;

print "Pos Relevanz Posts F'ups Poster\n";
print "--- -------- ----- ----- ------------------\n";

# Final ranking: ratio descending, ties broken by number of postings.
my $i = 1;
for my $row (sort { $b->[0] <=> $a->[0] || $b->[1] <=> $a->[1] } @arr) {
    printf "%2d) %2.4f %4d %4d %s\n", $i++, @$row;
}
------------------------------------------------------------------
Original Python translation (with regexes):
------------------------------------------------------------------
#!/usr/bin/python
import sys, os, re
if len(sys.argv) == 1:
sys.exit(0)
post = {}
fup = {}
re1 = re.compile(r"\s*<.*>\s*")
re2 = re.compile(r".*\((.*)\).*")
re3 = re.compile(r'^"(.*)"$')
for line in sys.stdin:
line = line.rstrip("\n")
try:
nix, fro, mid, ref = line.split("|")
except: continue
fro = re1.sub("", fro)
fro = re2.sub(r"\1", fro)
fro = re3.sub(r"\1", fro)
post.setdefault(fro, []).append(mid)
if ref in fup: fup[ref] += 1
else: fup[ref] = 1
res = []
for name, arr in post.iteritems():
m = 0
for item in arr:
if item in fup:
m += fup[item]
n = len(arr)
res.append((float(m)/n, n, m, name))
res.sort(lambda x, y: cmp(y[1], x[1]))
del res[int(sys.argv[1]):]
print "Pos Relevanz Posts F'ups Poster"
print "--- -------- ----- ----- ------------------"
res.sort(lambda x, y: cmp(y[0], x[0]) or cmp(y[1], x[1]))
i = 1
for item in res:
print "%2d)" % i, " %2.4f %4d %4d %s" % item
i += 1
------------------------------------------------------------------
Optimized Python (without regexes):
------------------------------------------------------------------
#!/usr/bin/python
import sys, os
# Without a command-line argument there is nothing to report: exit quietly.
if len(sys.argv) == 1:
    raise SystemExit(0)

# post: poster name -> list of message-ids; fup: reference -> count.
post, fup = {}, {}
def inparens(text):
    """Return the part of text between its first "(" and its last ")".

    The text is returned unchanged when it has no "(" or when no ")"
    follows the first "(".
    """
    start = text.find("(")
    end = text.rfind(")")
    if start == -1 or end < start:
        return text
    return text[start + 1:end]
def removeangles(text):
    """Cut everything from the first "<" through the last ">" out of text.

    Whitespace directly before the "<" and directly after the ">" is
    removed as well.  The text is returned unchanged when it has no "<"
    or when no ">" follows the first "<".
    """
    lt = text.find("<")
    gt = text.rfind(">")
    if lt == -1 or gt < lt:
        return text
    head = text[:lt].rstrip()
    tail = text[gt + 1:].lstrip()
    return head + tail
for line in sys.stdin:
line = line.rstrip("\n")
try:
nix, fro, mid, ref = line.split("|")
except: continue
fro = inparens(removeangles(fro))
if fro.startswith('"'):
if fro.endswith('"'):
fro = fro[1:-1]
post.setdefault(fro, []).append(mid)
if ref in fup: fup[ref] += 1
else: fup[ref] = 1
res = []
for name, arr in post.iteritems():
m = 0
for item in arr:
if item in fup:
m += fup[item]
n = len(arr)
res.append((float(m)/n, n, m, name))
res.sort(lambda x, y: cmp(y[1], x[1]))
del res[int(sys.argv[1]):]
print "Pos Relevanz Posts F'ups Poster"
print "--- -------- ----- ----- ------------------"
res.sort(lambda x, y: cmp(y[0], x[0]) or cmp(y[1], x[1]))
i = 1
for item in res:
print "%2d)" % i, " %2.4f %4d %4d %s" % item
i += 1