R
Robert Manea
Hello,
ever wondered how many different user agents are being used in a
newsgroup or even a whole usenet hierarchy? But didn't want to use
'the other' tools because they draw their statistics based on the wrong
criteria.
No?
Well, I did and came up with the following.
Suggestions, improvements, corrections and the like are highly welcome.
---8<---------------------------------------------------------------8<---
Example output for comp.lang.perl.misc:
1. Microsoft Outlook Express : 81
2. Mozilla : 35
3. Mozilla Thunderbird : 33
4. G2 : 32
5. slrn : 25
6. KNode : 21
7. Gnus : 19
8. Pan : 17
9. Forte Agent : 15
10. tin : 12
[...]
Summary:
- 2141 postings in total
- 30 different neawsreaders in 333 distinct postings
- Average of 5.574 articles per poster (with agent header)
- 285 without User Agent header
---8<---------------------------------------------------------------8<---
The code:
#!/bin/perl -w
#
# (c) 2004 by Robert Manea
#
# Retrieve 'User-Agent' headers from usenet postings and display the
# commonness of each newsreader. Only distinct posters count, i.e.
# all postings with the same email address count as a single occurrence
# of the corresponding newsreader.
#
# Caveats: Postings with the same email address but different user
# agents can't be distinguished correctly
use strict;
use warnings;
use File::Find;
die "Usage: $0 <path/to/newsspool>" unless $ARGV[0];
our ( %agents, %emails );
our $cnt_file = 0;
our $no_agent = 0;
find( \&wanted, $ARGV[0] );
# Tiger Woods himself
my $max_len = '';
$max_len |= $_ foreach keys %agents;
$max_len = 1 + length $max_len;
my ( $cnt_reader, $cnt_articles ) = ( 0, 0 );
for ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) {
my $len = $max_len - length $_;
printf "%3d. %s %*s: %d\n", ++$cnt_reader, $_, $len, ' ', $agents{$_};
$cnt_articles += $agents{$_};
}
my $w_agent_avg = sprintf "%.3f", ($cnt_file - $no_agent) / $cnt_articles;
print << "EOF";
Summary:
- $cnt_file postings in total
- $cnt_reader different neawsreaders in $cnt_articles distinct postings
- Average of $w_agent_avg articles per poster (with agent header)
- $no_agent without User Agent header
EOF
sub wanted {
my $agent_header = qr/^User-Agent:|^X-User-Agent:|^X-Newsreader:|^X-Mailer:/;
my $from_header = qr/From:.*?([A-Za-z0-9\.]+@[A-Za-z0-9\.]+)\s*.*/;
my $file_name = $File::Find::name;
if ( -f $file_name && $file_name !~ /\/\./ ) {
open FH, "<$file_name" or ( warn "Cannot open $file_name: $!" and return 0);
++$cnt_file;
my ( $email, $reader );
while (<FH>) {
chomp;
if (/$from_header/) {
$email = $1;
}
elsif (/$agent_header/) {
# TODO: Faster general approach to determine the
# newsreader
my $raw_agent_str = ( split /: / )[1];
( $reader = ( split /\//, $raw_agent_str )[0] ) =~
s/( [A-Za-z]*\.*\d+\.\d+.*$)|(\(*?\[+?.*$)|(\[*?\(+?.*$)//o;
}
elsif ( $email && $reader ) {
if ( !$emails{$email} ) {
$agents{$reader}++;
$emails{$email}++;
}
last;
}
elsif (/^$/) { # Parse only header lines
++$no_agent;
last;
}
}
}
close FH;
1;
}
__END__
Thanks & Greets, Rob
ever wondered how many different user agents are being used in a
newsgroup or even a whole usenet hierarchy? But didn't want to use
'the other' tools because they draw their statistics based on the wrong
criteria.
No?
Well, I did and came up with the following.
Suggestions, improvements, corrections and the like are highly welcome.
---8<---------------------------------------------------------------8<---
Example output for comp.lang.perl.misc:
1. Microsoft Outlook Express : 81
2. Mozilla : 35
3. Mozilla Thunderbird : 33
4. G2 : 32
5. slrn : 25
6. KNode : 21
7. Gnus : 19
8. Pan : 17
9. Forte Agent : 15
10. tin : 12
[...]
Summary:
- 2141 postings in total
- 30 different neawsreaders in 333 distinct postings
- Average of 5.574 articles per poster (with agent header)
- 285 without User Agent header
---8<---------------------------------------------------------------8<---
The code:
#!/bin/perl -w
#
# (c) 2004 by Robert Manea
#
# Retrieve 'User-Agent' headers from usenet postings and display the
# commonness of each newsreader. Only distinct posters count, i.e.
# all postings with the same email address count as a single occurrence
# of the corresponding newsreader.
#
# Caveats: Postings with the same email address but different user
# agents can't be distinguished correctly
use strict;
use warnings;
use File::Find;
die "Usage: $0 <path/to/newsspool>" unless $ARGV[0];
our ( %agents, %emails );
our $cnt_file = 0;
our $no_agent = 0;
find( \&wanted, $ARGV[0] );
# Tiger Woods himself
my $max_len = '';
$max_len |= $_ foreach keys %agents;
$max_len = 1 + length $max_len;
my ( $cnt_reader, $cnt_articles ) = ( 0, 0 );
for ( sort { $agents{$b} <=> $agents{$a} } keys %agents ) {
my $len = $max_len - length $_;
printf "%3d. %s %*s: %d\n", ++$cnt_reader, $_, $len, ' ', $agents{$_};
$cnt_articles += $agents{$_};
}
my $w_agent_avg = sprintf "%.3f", ($cnt_file - $no_agent) / $cnt_articles;
print << "EOF";
Summary:
- $cnt_file postings in total
- $cnt_reader different neawsreaders in $cnt_articles distinct postings
- Average of $w_agent_avg articles per poster (with agent header)
- $no_agent without User Agent header
EOF
sub wanted {
my $agent_header = qr/^User-Agent:|^X-User-Agent:|^X-Newsreader:|^X-Mailer:/;
my $from_header = qr/From:.*?([A-Za-z0-9\.]+@[A-Za-z0-9\.]+)\s*.*/;
my $file_name = $File::Find::name;
if ( -f $file_name && $file_name !~ /\/\./ ) {
open FH, "<$file_name" or ( warn "Cannot open $file_name: $!" and return 0);
++$cnt_file;
my ( $email, $reader );
while (<FH>) {
chomp;
if (/$from_header/) {
$email = $1;
}
elsif (/$agent_header/) {
# TODO: Faster general approach to determine the
# newsreader
my $raw_agent_str = ( split /: / )[1];
( $reader = ( split /\//, $raw_agent_str )[0] ) =~
s/( [A-Za-z]*\.*\d+\.\d+.*$)|(\(*?\[+?.*$)|(\[*?\(+?.*$)//o;
}
elsif ( $email && $reader ) {
if ( !$emails{$email} ) {
$agents{$reader}++;
$emails{$email}++;
}
last;
}
elsif (/^$/) { # Parse only header lines
++$no_agent;
last;
}
}
}
close FH;
1;
}
__END__
Thanks & Greets, Rob