sorry if it's a naive question, but I never needed up to now to deal
with anything beyond a-zA-Z0-9....
now I have a long text with standard ascii and, sometimes, cyrillic
text - and I want to filter it out (the cyrillic, of course).
I tried with
perl -wne 'foreach($_){if (/(\p{InCyrillic})/){print"record $_ matches
but it fails to recognize it - what am I doing wrong?
thanks for any help...
alessandro
Try this on your file and post the output.
Assign $fname = 'your file' and run it.
-sln
# File: bomtest.pl
#
# CheckUTF() -
# UTF-8/16/32 check, Will add a guessed utf-16/32 layer
# if there is no current encoding() layer.
# Opened filehandle should be in byte context.
# This is more for automatically adding a utf layer on
# unknown files opened in byte mode.
# Guesses utf-8/16/32 encodings, BOMb'd or not.
# Uses Encode::Guess module. Will use standard guess settings (see docs).
# Used standalone or optionally with Get_BOM().
#
# Get_BOM() -
# SEEK_SET workaround for BOMb'd Encoding(utf-) layer Bug
# and a general way to get the BOM and offset for subsequent
# seeks.
# The file can be opened using any encoding. The handle passed
# is duped, that handle set to ':raw'. No character semantics
# are used when analysing the BOM.
# This will also detect a mismatch in width between the
# detected BOM and the current opened encoding layer.
# Used standalone or optionally with CheckUTF().
# CheckUTF() has an option to get the bom on the callers
# behalf, combining steps.
#
# test_surrogate_pair() -
# Just a utility function to print hi/lo surrogate pairs
# given a unicode test character (> 0x10000).
#
#
# Modify the top variables to test various configurations.
# Includes:
# $fname File name for output and input
# $write_file Flag 0/1 to create a test file
# (0 to not write, just read file $fname)
# $data Data string for writing test file, can be unicode
# $enc Encoding to read/write file $fname
# $test_surr Stand-alone surrogate pair test
#
# -sln, 8/13/09
# --------------------------------------------------------
use strict;
use warnings;
binmode (STDOUT, ':utf8');
{
# my $fname = 'dummy.txt';
my $fname = 'c:\temp\XML\CollectedData_1804.xml';
my $Unichar = "\x{21000}";
my $data = "\x{1}\x{2}$Unichar";
my $enc = ''; # blank or UTF-xxLE/BE or :bytes, :utf8, :raw, etc ...
my $enc_string = ($enc =~ /:/) ? "$enc" : (length($enc) ? "encoding($enc)" : '');
my $write_file = 0;
unless ($write_file eq 'dummy.txt') { $write_file = 0; }
my $test_surr = 0;
my ($fh, $line);
my ($width,$encfound,$bom,$offset);
# Test surrogate pairs ..
##
test_surrogate_pair($Unichar) if $test_surr;
# Create a test file with encoding ..
##
if ($write_file) {
print "\nWriting sample data as '$enc' to $fname\n",'-'x20,"\n";
open $fh, ">$enc_string", $fname or die "can't open $fname for write $!";
print $fh $data;
close $fh;
} else {
print "\nNOT WRITING to $fname !!\n",'-'x20,"\n";
}
# Read test file with encoding ..
##
print "\nReading $fname as '$enc'\n",'-'x20,"\n";
open $fh, "<$enc_string", $fname or die "can't open $fname for read $!";
$offset = CheckUTF($fh, 'size'=> 100, 'verbose'=> 1, 'getbom_v'=> 1);
print "\n";
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ".ordsplit($line)."\n";
print "$line\n";
}
close $fh;
# Read test file with encoding into scalar, open handle to scalar as :utf8 ..
##
print "\nBuffering $fname as '$enc', open/read buffer as ':utf8'\n",'-'x20,"\n";
open $fh, "<$enc_string", $fname or die "can't open $fname for read $!";
$offset = CheckUTF($fh, 'size'=> 100, 'verbose'=> 1, 'getbom_v'=> 1);
seek($fh, $offset, 0); # SEEK_SET
my $membuf = <$fh>;
close $fh;
open $fh, "<:utf8", \$membuf or die "can't open \$membuf for read $!";
$offset = 0;
print "\n";
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ".ordsplit($line)."\n";
}
close $fh;
# Read test file in byte mode ..
##
print "\nReading $fname as bytes\n",'-'x20,"\n";
open $fh, '<', $fname or die "can't open $fname for read $!";
$offset = 0;
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ";
for (map {ord $_} split //, $line) {
printf ("%x ",$_);
}
print "\n";
}
close $fh;
}
exit 0;
##
## End of program ...
##
sub ordsplit
{
my $string = shift;
my $buf = '';
for (map {ord $_} split //, $string) {
$buf.= sprintf ("%x ",$_);
}
return $buf;
}
sub Get_BOM
{
my ($fh_orig, $verbose) = @_;
$verbose = 0 unless (defined $verbose and lc($verbose) eq 'v');
use Encode;
my %bom2enc = (
map { encode($_, "\x{feff}") => $_ } qw(
UTF-8
UTF-16BE
UTF-16LE
UTF-32BE
UTF-32LE
)
);
my %enc2bom = (
reverse(%bom2enc),
map { $_ => encode($_, "\x{feff}") } qw(
UCS-2
iso-10646-1
utf8
)
);
my @bombs = sort { length $b <=> length $a } keys %bom2enc;
my $MAX_BOM_LENGTH = length $bombs[0];
my $bomstr = join '|', @bombs;
my $bom_re = qr/^($bomstr)/o;
my @arlays = PerlIO::get_layers($fh_orig);
my $layers = "@arlays";
print "Get_BOM() - Layers = $layers\n" if $verbose;
if (defined($fh_orig) && (ref($fh_orig) eq 'GLOB' || ref(\$fh_orig) eq 'GLOB'))
{
# Dup the passed in handle, binmode it to raw
# We will ONLY do byte semantics !!
# --------------------------------------------------
open my $fh_dup, "<&", $fh_orig or die "can't dup filehandle: $!";
binmode ($fh_dup, ":raw");
if (seek($fh_dup, 0, 0)) # SEEK_SET
{
my $sample = '';
## Read in $MAX_BOM_LENGTH characters.
## As a check, store character/bytes read based
## on the differential file position per read.
## Since our dup handle is in byte mode, the bytes
## per character will always be 1.
## This is a hold over when there could be encoded
## characters and is not necessary.
## -----------------------------------------------
my $fpos = tell($fh_dup);
my @samparray = ();
print " * Reading up to $MAX_BOM_LENGTH characters\n" if $verbose;
for (1 .. $MAX_BOM_LENGTH)
{
my $count = read ($fh_dup, my $buff, 1);
last if (!defined $count or $count != 1);
push @samparray, {'char'=> $buff, 'width'=> (tell($fh_dup)-$fpos)};
$fpos = tell($fh_dup);
}
$sample = '';
for my $href (@samparray) {
$sample .= $href->{'char'};
printf (" - char %x , bytes = %d\n", ord($href->{'char'}), $href->{'width'}) if $verbose;
}
if ($verbose) {
print " - Read ".length($sample)." characters. Position = $fpos\n";
print " - Sample (".length($sample).") = ";
for (map {ord $_} split //, $sample) {
printf ("%02x ", $_);
}
print "\n * Analysing bytes\n";
}
## Convert the 'characters' to bytes.
## Only process $MAX_BOM_LENGTH bytes.
## We only care about the BOM, which will match
## the encoding bytes we already have in %bom2enc.
## -----------------------------------------------
my ($octets, $numb_octets, $vecoffset) = ('',0,0);
for my $href (@samparray)
{
my @ar = ();
my ($charwidth, $ordchar) = (
$href->{'width'},
ord( $href->{'char'})
);
last if (($numb_octets + $charwidth) > $MAX_BOM_LENGTH);
$numb_octets += $charwidth;
while ($ordchar > 0) {
push @ar, $ordchar & 0xff;
$ordchar >>= 8;
--$charwidth;
}
push (@ar,0) while ($charwidth-- > 0);
for (reverse @ar) {
vec ($octets, $vecoffset++, 8) = $_;
}
}
if ($verbose) {
print " BOMS avail = ";
for my $bomavail (@bombs) {
print '( ';
for (map {ord $_} split //, $bomavail) {
printf ("%02x",$_);
}
print ' ) ';
}
print "\n - Bom bytes from sample (".length($octets).") = ";
for (map {ord $_} split //, $octets) {
printf ("%02x ", $_);
}
print "\n";
}
# end convert
my ($bomenc, $bom, $bomwidth, $bomoffset) = ('','','',0);
if (my ($found) = $octets =~ /$bom_re/) {
($bomenc, $bomoffset) = (
$bom2enc{$found},
length($found)
);
for (map {ord $_} split //, $found) {
$bom .= sprintf ("%02x", $_);
}
print " * Found $bomenc, BOM = $bom\n" if ($verbose);
my $noendian = $bomenc;
$noendian =~ s/(?:LE|BE)$//i;
$bomwidth = ($layers =~ /$noendian/i) ? 'ok' : 'mismatch';
print " - Does not match width of current encoding layer\n" if ($verbose and $bomwidth ne 'ok');
} else {
print " * BOM Not found\n" if $verbose;
}
close ($fh_dup);
# Original handle -> Must SEEK_SET/read twice to
# mitigate UTF-16/32 BOM seek bug
# ----------------------------------------------
seek($fh_orig, 0, 0); # SEEK_SET
read ($fh_orig, $sample, 1); # read
seek($fh_orig, 0, 0); # SEEK_SET last. Caller to seek past BOM.
return ($bomenc, $bom, $bomwidth, $bomoffset);
}
close ($fh_dup);
# seek failed, fall through
}
# There was an error ..
if ($verbose) {
print " * Invalid filehandle or file is unseekable\n - Utf BOMs available (max length = $MAX_BOM_LENGTH):\n";
while (my ($key,$val) = each %enc2bom)
{
my $valstring = '';
for (split //, $val) {
$valstring .= sprintf ("%x ",ord $_);
}
print " $key = $bom2enc{$val} = $valstring\n";
#print " $key = $bom2enc{$val} = '$val' = '$enc2bom{$key}' = $valstring\n";
}
}
return ('','','',-1);
}
sub CheckUTF
{
# UTF-8/16/32 check
# Will add a layer if there is no current
# encoding() layer. So open filehandle should
# be in byte context.
# ----------------------------------------
# Result value:
# -1 - not filehandle (or undef'ed)
# 0 - read error or empty file
# 1 - already have encoding layer (could force change later)
# 2 - guess_encoder() error message
# 3 - do nothing unless utf-16/32(be/le)
# 4 - added a layer
my ($fh_check,@args) = @_;
my %parm = ('verbose'=>0, 'size'=>60, 'getbom' =>0, 'bomopts'=>'');
while (my ($name, $val) = splice (@args, 0, 2))
{
$name =~ s/^\s+|\s+$//;
$name = lc $name;
next if not defined $val;
if ( $name =~ /^getbom(?:_(\w*))?/ and $val) {
$parm{'gbom'} = 1;
$parm{'bomopts'} = $1 if defined($1);
}
elsif ($name eq 'verbose' || $name eq 'size' && $val > 4) {
$parm{$name} = $val;
}
}
# 0 1 2 3 4 5 6
my ($Res, $Utfmsg, $layers, $bomenc, $bom, $bomwidth, $bomoffset) = (
0, 'UTF Check', '', '', '', '', 0);
$layers = ':'.join (':', PerlIO::get_layers($fh_check)).':';
if (!defined($fh_check) || (ref($fh_check) ne 'GLOB' && ref(\$fh_check) ne 'GLOB')) {
$Utfmsg .= ", not a filehandle";
print "$Utfmsg\n" if $parm{'verbose'};
$Res = -1;
}
elsif ($layers =~ /:encoding/) {
$Utfmsg .= ", already have encoding layer";
$Res = 1;
}
else {
my ($count, $sample);
my $utf8layer = $layers =~ /:utf8/;
binmode ($fh_check,":bytes");
seek ($fh_check, 0, 0); # SEEK_SET
# Try to read a large sample
if (!defined($count = read ($fh_check,$sample,$parm{'size'},0))) {
$Utfmsg .= ". $!"; # read error
} elsif ($count <= 0) {
$Utfmsg .= ". File is empty";
} else {
seek ($fh_check, 0, 0); # SEEK_SET
use Encode::Guess;
my $decoder = guess_encoding ($sample); # ascii/utf8/BOMed UTF
$Res = 2;
if (ref($decoder)) {
my $name = $decoder->name;
$decoder = 'Do nothing';
$Res = 3;
$Utfmsg .= ", guess($count): $name";
if ($name =~ /UTF.*?(?:16|32)/i) {
# $name =~ s/(?:LE|BE)$//i;
$decoder = "Adding layer";
$Res = 4;
binmode ($fh_check, ":encoding($name)");
}
}
$Utfmsg .= ". $decoder" if (defined $decoder); # guess error
}
binmode ($fh_check,":utf8") if $utf8layer;
}
$Utfmsg .= ' ..';
$layers = "@{[PerlIO::get_layers($fh_check)]}";
print "@{[$Utfmsg,$layers]}\n" if $parm{'verbose'};
$bomoffset = $Res if $Res == -1;
if ($Res != -1 && $parm{'gbom'}) {
($bomenc, $bom, $bomwidth, $bomoffset) = Get_BOM ($fh_check, $parm{'bomopts'});
$Utfmsg .= " bom($bomoffset) = $bom ..";
print "Get_BOM returned $bomenc, $bom, $bomwidth, $bomoffset\n" if $parm{'verbose'};
}
return ($Res, $Utfmsg, $layers, $bomenc, $bom, $bomwidth, $bomoffset);
}
sub test_surrogate_pair
{
my $test = shift;
print "\nTesting surrogate pairs\n",'-'x20,"\n";
if (ord($test) < 0x10000) {
print "nothing to test < 0x10000\n";
} else {
my $hi = (ord($test) - 0x10000) / 0x400 + 0xD800;
my $lo = (ord($test) - 0x10000) % 0x400 + 0xDC00;
my $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
printf "test uni = %x\n", ord($test);
printf "hi surrogate = %x\n", $hi;
printf "lo surrogate = %x\n", $lo;
printf "uni = %x\n", $uni;
}
}
__END__
c:\temp\new_v3c>perl bomtest.pl
NOT WRITING to c:\temp\XML\CollectedData_1804.xml !!
--------------------
Reading c:\temp\XML\CollectedData_1804.xml as ''
--------------------
UTF Check, guess(100): UTF-16. Adding layer .. unix crlf encoding(UTF-16) utf8
Get_BOM() - Layers = unix crlf encoding(UTF-16) utf8
* Reading up to 4 characters
- char ff , bytes = 1
- char fe , bytes = 1
- char 3c , bytes = 1
- char 0 , bytes = 1
- Read 4 characters. Position = 4
- Sample (4) = ff fe 3c 00
* Analysing bytes
BOMS avail = ( fffe0000 ) ( 0000feff ) ( efbbbf ) ( fffe ) ( feff )
- Bom bytes from sample (4) = ff fe 3c 00
* Found UTF-16LE, BOM = fffe
Get_BOM returned UTF-16LE, fffe, ok, 2
Pass 1: seek 2/2 ok, data (42) = 3c 3f 78 6d 6c 20 76 65 72 73 69 6f
6e 3d 22 31 2e 30 22 20 65 6e 63 6f 64 69 6e 67 3d 22 75 6e
69 63 6f 64 65 22 3f 3e d a
<?xml version="1.0" encoding="unicode"?>
Pass 2: seek 2/2 ok, data (42) = 3c 3f 78 6d 6c 20 76 65 72 73 69 6f
6e 3d 22 31 2e 30 22 20 65 6e 63 6f 64 69 6e 67 3d 22 75 6e
69 63 6f 64 65 22 3f 3e d a
<?xml version="1.0" encoding="unicode"?>
Pass 3: seek 2/2 ok, data (42) = 3c 3f 78 6d 6c 20 76 65 72 73 69 6f
6e 3d 22 31 2e 30 22 20 65 6e 63 6f 64 69 6e 67 3d 22 75 6e
69 63 6f 64 65 22 3f 3e d a
<?xml version="1.0" encoding="unicode"?>
Buffering c:\temp\XML\CollectedData_1804.xml as '', open/read buffer as ':utf8'
--------------------
UTF Check, guess(100): UTF-16. Adding layer .. unix crlf encoding(UTF-16) utf8
Get_BOM() - Layers = unix crlf encoding(UTF-16) utf8
* Reading up to 4 characters
- char ff , bytes = 1
- char fe , bytes = 1
- char 3c , bytes = 1
- char 0 , bytes = 1
- Read 4 characters. Position = 4
- Sample (4) = ff fe 3c 00
* Analysing bytes
BOMS avail = ( fffe0000 ) ( 0000feff ) ( efbbbf ) ( fffe ) ( feff )
- Bom bytes from sample (4) = ff fe 3c 00
* Found UTF-16LE, BOM = fffe
Get_BOM returned UTF-16LE, fffe, ok, 2
Pass 1: seek 0/0 ok, data (42) = 3c 3f 78 6d 6c 20 76 65 72 73 69 6f
6e 3d 22 31 2e 30 22 20 65 6e 63 6f 64 69 6e 67 3d 22 75 6e
69 63 6f 64 65 22 3f 3e d a
Pass 2: seek 0/0 ok, data (42) = 3c 3f 78 6d 6c 20 76 65 72 73 69 6f
6e 3d 22 31 2e 30 22 20 65 6e 63 6f 64 69 6e 67 3d 22 75 6e
69 63 6f 64 65 22 3f 3e d a
Pass 3: seek 0/0 ok, data (42) = 3c 3f 78 6d 6c 20 76 65 72 73 69 6f
6e 3d 22 31 2e 30 22 20 65 6e 63 6f 64 69 6e 67 3d 22 75 6e
69 63 6f 64 65 22 3f 3e d a
Reading c:\temp\XML\CollectedData_1804.xml as bytes
--------------------
Pass 1: seek 0/0 ok, data (85) = ff fe 3c 0 3f 0 78 0 6d 0 6c 0 2
0 0 76 0 65 0 72 0 73 0 69 0 6f 0 6e 0 3d 0 22 0 31 0 2e
0 30 0 22 0 20 0 65 0 6e 0 63 0 6f 0 64 0 69 0 6e 0 67 0
3d 0 22 0 75 0 6e 0 69 0 63 0 6f 0 64 0 65 0 22 0 3f 0 3e
0 d 0 a
Pass 2: seek 0/0 ok, data (85) = ff fe 3c 0 3f 0 78 0 6d 0 6c 0 2
0 0 76 0 65 0 72 0 73 0 69 0 6f 0 6e 0 3d 0 22 0 31 0 2e
0 30 0 22 0 20 0 65 0 6e 0 63 0 6f 0 64 0 69 0 6e 0 67 0
3d 0 22 0 75 0 6e 0 69 0 63 0 6f 0 64 0 65 0 22 0 3f 0 3e
0 d 0 a
Pass 3: seek 0/0 ok, data (85) = ff fe 3c 0 3f 0 78 0 6d 0 6c 0 2
0 0 76 0 65 0 72 0 73 0 69 0 6f 0 6e 0 3d 0 22 0 31 0 2e
0 30 0 22 0 20 0 65 0 6e 0 63 0 6f 0 64 0 69 0 6e 0 67 0
3d 0 22 0 75 0 6e 0 69 0 63 0 6f 0 64 0 65 0 22 0 3f 0 3e
0 d 0 a
c:\temp\new_v3c>