S
sln
* warning, tired person, large source code ahead ..
------------------------------------------------------
I don't guess they fixed this, and probably won't.
The failure of Encode to seek past the BOM on subsequent reads
then SEEK_SET 0 isin't something unique to Perl. Python has the same bug.
Here is a way to do it by analysing the BOM in the character semantics
of the encoding layer on the open handle without touching any layers
with binmode. The result is a conversion of a BOM into byte string
that can be compared to the octets of the Unicode family of encodings.
These octets of all the encoded BOM's are then simply read as keys to
a hash.
BOM's are unique in thier encoding's and file position so there can
be no ambiguity when analysing them in any particular read encoding
character semantic. Whatever the encoding, enough characters are
read (1 at a time, up to the Max_Bom_Width octet count) to convert
its ordinal value into the first Max_Bom_Width bytes of the characters.
The width is gleaned withe each read from a file position differential.
The bytes from each character ordinal are taken off in a loop with
a bitwise & 0xff then a shift >>= 8 on the ordinal until its zero.
0x0 is padded until the byte count equals the width.
Get the next character, then do it again until the total bytes
in the conversion adds up to Max_Bom_Width bytes (octets).
I threw this together from another project where I needed to do
this. I didn't really want to have to do this but was forced to
because of other requirements.
For better or worse, here it is. It works.
I thought I would throw it up here incase anybody else needs this
solution besides me.
-sln
--------------------------
# File: bomtest.pl
#
# CheckUTF() -
# UTF-8/16/32 check, Will add a guessed utf-16/32 layer
# if there is no current encoding() layer.
# Opened filehandle should be in byte context.
# This is more for automatically adding a utf layer on
# unknown files opened in byte mode.
# Only guesses utf-8/16/32 encodings, BOMb'd or not.
# Used standalone or optionally with Get_BOM().
#
# Get_BOM() -
# SEEK_SET workaround for BOMb'd Encoding(utf-) layer Bug
# and a general way to get the BOM and offset for subsequent
# seeks.
# Fixes SEEK_SET if need to itterate reading of BOMb'd file.
# Once called, subsequent seek calls are positionalble past
# the BOM from the offset returned. Only needs to be called
# once and preferably after a call to optional CheckUTF().
# Does not alter the PerlIO layers in any way.
# The BOM is determined with character semantics
# (that was the hard part!), a file opened in byte semantics
# is just more gravy.
# This will also detect a mismatch in width between the
# detected BOM and the current opened encoding layer.
# Used standalone or optionally with CheckUTF().
#
# test_surrogate_pair() -
# Just a utility function to print hi/lo surrogate pairs
# given a unicode test character (> 0x10000).
#
#
# Modify the top variables to test various configurations.
# It basically creates a test file in an encoding, reads it
# back in that encoding. Then reads it back in byte mode.
# The write flag can be set so that just reading occurs.
#
#
# -sln, 8/4/09
# --------------------------------------------------------
use strict;
use warnings;
binmode (STDOUT, ':utf8');
my $fname = 'dummy.txt';
my $Unichar = "\x{21000}";
my $data = "\x{1}\x{2}$Unichar";
my $enc = 'UTF-16'; # blank or UTF-xxLE/BE or :bytes, :utf8, :raw, etc ...
my $enc_string = ($enc =~ /:/) ? "$enc" : (length($enc) ? "encoding($enc)" : '');
my $write_file = 1;
my $test_surr = 0;
my ($fh, $line);
my ($width,$bom,$offset);
# Test surrogate pairs ..
#
test_surrogate_pair($Unichar) if $test_surr;
# Create a test file with encoding ..
#
if ($write_file) {
print "\nWriting sample data as '$enc' to $fname\n",'-'x20,"\n";
open $fh, ">$enc_string", $fname or die "can't open $fname for write $!";
print $fh $data;
close $fh;
}
# Read test file with encoding ..
#
print "\nReading $fname as '$enc'\n",'-'x20,"\n";
open $fh, "<$enc_string", $fname or die "can't open $fname for read $!";
CheckUTF($fh,'size'=> 100, 'verbose'=> 1);
($width,$bom,$offset) = Get_BOM( $fh, 'v' ); # list context, verbose prints
# $offset = Get_BOM( $fh ); # scalar context, no prints
print "\nGet_BOM returned $bom, $width, offset = $offset\n";
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ";
for (map {ord $_} split //, $line) {
printf ("%x ",$_);
}
print "\n";
}
close $fh;
# Read test file in byte mode ..
#
print "\nReading $fname as bytes\n",'-'x20,"\n";
open $fh, '<', $fname or die "can't open $fname for read $!";
$offset = 0;
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ";
for (map {ord $_} split //, $line) {
printf ("%x ",$_);
}
print "\n";
}
close $fh;
exit 0;
##
## End of program ...
##
sub Get_BOM
{
my ($fh, $verbose) = @_;
$verbose = 0 unless (defined $verbose and $verbose);
use Encode;
my %bom2enc = (
map { encode($_, "\x{feff}") => $_ } qw(
UTF-8
UTF-16BE
UTF-16LE
UTF-32BE
UTF-32LE
)
);
my %enc2bom = (
reverse(%bom2enc),
map { $_ => encode($_, "\x{feff}") } qw(
UCS-2
iso-10646-1
utf8
)
);
my @bombs = sort { length $b <=> length $a } keys %bom2enc;
my $MAX_BOM_LENGTH = length $bombs[0];
my $bomstr = join '|', @bombs;
my $bom_re = qr/^($bomstr)/o;
my $layers = "@{[PerlIO::get_layers($fh)]}";
print "\nGet_BOM() - Layers = $layers\n" if $verbose;
if (defined($fh) && (ref($fh) eq 'GLOB' || ref(\$fh) eq 'GLOB'))
{
# Note - Must SEEK_SET/read twice to
# mitigate UTF-16/32 BOM seek bug
# -----------------------------------
if (seek($fh, 0, 0))
{
my $sample = '';
read ($fh, $sample, 1);
seek($fh, 0, 0); # SEEK_SET
my $fpos = tell($fh);
my @samparray = ();
## Read in $MAX_BOM_LENGTH 'characters'
## in whatever encoding the file was opened with.
## Store each 'character' and bytes/char - based
## on the differential file position per read.
## A BOM can be sucessfully read un-alterred in
## 1,2,4 or 8 byte sequences.
## -------------------------------------------
for (1 .. $MAX_BOM_LENGTH)
{
my $count = read ($fh, my $buff, 1);
last if (!defined $count or $count != 1);
push @samparray, {'char'=> $buff, 'width'=> (tell($fh)-$fpos)};
$fpos = tell($fh);
}
$sample = '';
print ">> Reading up to $MAX_BOM_LENGTH characters\n" if $verbose;
for my $aref (@samparray) {
$sample .= $aref->{'char'};
printf (" - char %x , bytes = %d\n", ord($aref->{'char'}), $aref->{'width'}) if $verbose;
}
seek($fh, 0, 0); # SEEK_SET, last seek, set to position 0
if ($verbose) {
print " - Read ".length($sample)." characters. Position = $fpos\n";
print " - Sample (".length($sample).") = ";
for (map {ord $_} split //, $sample) {
printf ("%02x ", $_);
}
print "\n>> Converting to bytes\n";
}
## Convert the 'characters' to bytes.
## Only process $MAX_BOM_LENGTH bytes (octets).
## We only care about the BOM, which will match
## the encoding bytes we already have in %bom2enc.
## -----------------------------------------------
my ($octets, $offset) = ('',0);
my $numb_octets = 0;
for my $aref (@samparray)
{
my @ar = ();
my ($width, $ordchar) = (
$aref->{'width'},
ord( $aref->{'char'})
);
last if (($numb_octets + $width) > $MAX_BOM_LENGTH);
$numb_octets += $width;
while ($ordchar > 0) {
push @ar, $ordchar & 0xff;
$ordchar >>= 8;
--$width;
}
push (@ar,0) while ($width-- > 0);
for (reverse @ar) {
vec ($octets, $offset++, 8) = $_;
}
}
if ($verbose) {
print " BOMS avail = ";
for my $bom (@bombs) {
print '( ';
for (map {ord $_} split //, $bom) {
printf ("%02x",$_);
}
print ' ) ';
}
print "\n - Bom octets from sample (".length($octets).") = ";
for (map {ord $_} split //, $octets) {
printf ("%02x ", $_);
}
print "\n";
}
# end convert
if (my ($found) = $octets =~ /$bom_re/) {
my ($offset, $bomstring) = (
length($found),
$bom2enc{$found}
);
if ($verbose) {
print ">> Found $bomstring, BOM = ";
for (map {ord $_} split //, $found) {
printf ("%02x", $_);
}
print "\n";
}
my $noendian = $bomstring;
$noendian =~ s/(?:LE|BE)$//i;
my $width = ($layers =~ /$noendian/i) ? 'ok' : 'mismatch';
print " - Does not match width of current encoding layer\n" if ($verbose and $width ne 'ok');
return ($width, $bomstring,$offset);
}
print ">> BOM Not found\n" if $verbose;
return ('','',0);
}
# can't seek here, fall through
}
# There was an error ..
print "\nGet_BOM() - Layers = $layers\n" if (!$verbose);
print ">> Invalid filehandle or file is unseekable\n - Utf BOMs available (max length = $MAX_BOM_LENGTH):\n";
while (my ($key,$val) = each %enc2bom)
{
my $valstring = '';
for (split //, $val) {
$valstring .= sprintf ("%x ",ord $_);
}
print " $key = $bom2enc{$val} = $valstring\n";
#print " $key = $bom2enc{$val} = '$val' = '$enc2bom{$key}' = $valstring\n";
}
return ('','',-1);
}
sub CheckUTF
{
# UTF-8/16/32 check
# Will add a layer if there is no current
# encoding() layer. So open filehandle should
# be in byte context.
# ----------------------------------------
# Result value:
# -1 - not filehandle (or undef'ed)
# 0 - read error or empty file
# 1 - already have encoding layer (could force change later)
# 2 - guess_encoder() error message
# 3 - do nothing unless utf-16/32(be/le)
# 4 - added a layer
my ($fh,@args) = @_;
my %parm = ('verbose'=> 0, 'size'=> 60);
while (my ($name, $val) = splice (@args, 0, 2)) {
$name =~ s/^\s+|\s+$//; next if not defined $val;
if (lc($name) eq 'verbose' || (lc($name) eq 'size' && $val > 4))
{ $parm{$name} = $val; }
}
my ($Res, $Utfmsg, $Layers) = (0, 'UTF Check', '');
if (!defined($fh) || (ref($fh) ne 'GLOB' && ref(\$fh) ne 'GLOB')) {
$Utfmsg .= ", not a filehandle ..";
print "$Utfmsg\n" if $parm{'verbose'};
return wantarray ? ($Res,$Utfmsg,'') : $Utfmsg;
}
$Layers = ':'.join (':', PerlIO::get_layers($fh)).':';
if ($Layers =~ /:encoding/) {
$Utfmsg .= ", already have encoding layer";
$Res = 1;
} else {
my ($count, $sample);
my $utf8layer = $Layers =~ /:utf8/;
binmode ($fh,":bytes");
seek ($fh, 0, 0);
# Try to read a large sample
if (!defined($count = read ($fh,$sample,$parm{'size'},0))) {
$Utfmsg .= ". $!"; # read error
} elsif ($count <= 0) {
$Utfmsg .= ". File is empty";
} else {
seek ($fh, 0, 0); # SEEK_SET
use Encode::Guess;
my $decoder = guess_encoding ($sample); # ascii/utf8/BOMed UTF
$Res = 2;
if (ref($decoder)) {
my $name = $decoder->name;
$decoder = 'Do nothing';
$Res = 3;
$Utfmsg .= ", guess($count read): $name";
if ($name =~ /UTF.*?(?:16|32)/i) {
# $name =~ s/(?:LE|BE)$//i;
$decoder = "Adding layer";
$Res = 4;
binmode ($fh, ":encoding($name)");
}
}
$Utfmsg .= ". $decoder" if (defined $decoder); # guess error
}
binmode ($fh,":utf8") if $utf8layer;
}
$Utfmsg .= ' ..';
$Layers = "@{[PerlIO::get_layers($fh)]}";
print "@{[$Utfmsg,$Layers]}\n" if $parm{'verbose'};
return wantarray ? ($Res,$Utfmsg,$Layers) : "@{[$Utfmsg,$Layers]}";
}
sub test_surrogate_pair
{
my $test = shift;
print "\nTesting surrogate pairs\n",'-'x20,"\n";
if (ord($test) < 0x10000) {
print "nothing to test < 0x10000\n";
} else {
my $hi = (ord($test) - 0x10000) / 0x400 + 0xD800;
my $lo = (ord($test) - 0x10000) % 0x400 + 0xDC00;
my $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
printf "test uni = %x\n", ord($test);
printf "hi surrogate = %x\n", $hi;
printf "lo surrogate = %x\n", $lo;
printf "uni = %x\n", $uni;
}
}
__END__
C:\temp>perl bomtest.pl
Writing sample data as 'UTF-16' to dummy.txt
--------------------
Reading dummy.txt as 'UTF-16'
--------------------
UTF Check, already have encoding layer .. unix crlf encoding(UTF-16) utf8
Get_BOM() - Layers = unix crlf encoding(UTF-16) utf8- char feff , bytes = 2
- char 1 , bytes = 2
- char 2 , bytes = 2
- char 21000 , bytes = 4
- Read 4 characters. Position = 10
- Sample (4) = feff 01 02 21000BOMS avail = ( fffe0000 ) ( 0000feff ) ( efbbbf ) ( fffe ) ( feff )
- Bom octets from sample (4) = fe ff 00 01
Get_BOM returned UTF-16BE, ok, offset = 2
Pass 1: seek 2/2 ok, data (3) = 1 2 21000
Pass 2: seek 2/2 ok, data (3) = 1 2 21000
Pass 3: seek 2/2 ok, data (3) = 1 2 21000
Reading dummy.txt as bytes
--------------------
Pass 1: seek 0/0 ok, data (10) = fe ff 0 1 0 2 d8 44 dc 0
Pass 2: seek 0/0 ok, data (10) = fe ff 0 1 0 2 d8 44 dc 0
Pass 3: seek 0/0 ok, data (10) = fe ff 0 1 0 2 d8 44 dc 0
C:\temp>
------------------------------------------------------
I don't guess they fixed this, and probably won't.
The failure of Encode to seek past the BOM on subsequent reads
then SEEK_SET 0 isin't something unique to Perl. Python has the same bug.
Here is a way to do it by analysing the BOM in the character semantics
of the encoding layer on the open handle without touching any layers
with binmode. The result is a conversion of a BOM into byte string
that can be compared to the octets of the Unicode family of encodings.
These octets of all the encoded BOM's are then simply read as keys to
a hash.
BOM's are unique in thier encoding's and file position so there can
be no ambiguity when analysing them in any particular read encoding
character semantic. Whatever the encoding, enough characters are
read (1 at a time, up to the Max_Bom_Width octet count) to convert
its ordinal value into the first Max_Bom_Width bytes of the characters.
The width is gleaned withe each read from a file position differential.
The bytes from each character ordinal are taken off in a loop with
a bitwise & 0xff then a shift >>= 8 on the ordinal until its zero.
0x0 is padded until the byte count equals the width.
Get the next character, then do it again until the total bytes
in the conversion adds up to Max_Bom_Width bytes (octets).
I threw this together from another project where I needed to do
this. I didn't really want to have to do this but was forced to
because of other requirements.
For better or worse, here it is. It works.
I thought I would throw it up here incase anybody else needs this
solution besides me.
-sln
--------------------------
# File: bomtest.pl
#
# CheckUTF() -
# UTF-8/16/32 check, Will add a guessed utf-16/32 layer
# if there is no current encoding() layer.
# Opened filehandle should be in byte context.
# This is more for automatically adding a utf layer on
# unknown files opened in byte mode.
# Only guesses utf-8/16/32 encodings, BOMb'd or not.
# Used standalone or optionally with Get_BOM().
#
# Get_BOM() -
# SEEK_SET workaround for BOMb'd Encoding(utf-) layer Bug
# and a general way to get the BOM and offset for subsequent
# seeks.
# Fixes SEEK_SET if need to itterate reading of BOMb'd file.
# Once called, subsequent seek calls are positionalble past
# the BOM from the offset returned. Only needs to be called
# once and preferably after a call to optional CheckUTF().
# Does not alter the PerlIO layers in any way.
# The BOM is determined with character semantics
# (that was the hard part!), a file opened in byte semantics
# is just more gravy.
# This will also detect a mismatch in width between the
# detected BOM and the current opened encoding layer.
# Used standalone or optionally with CheckUTF().
#
# test_surrogate_pair() -
# Just a utility function to print hi/lo surrogate pairs
# given a unicode test character (> 0x10000).
#
#
# Modify the top variables to test various configurations.
# It basically creates a test file in an encoding, reads it
# back in that encoding. Then reads it back in byte mode.
# The write flag can be set so that just reading occurs.
#
#
# -sln, 8/4/09
# --------------------------------------------------------
use strict;
use warnings;
binmode (STDOUT, ':utf8');
my $fname = 'dummy.txt';
my $Unichar = "\x{21000}";
my $data = "\x{1}\x{2}$Unichar";
my $enc = 'UTF-16'; # blank or UTF-xxLE/BE or :bytes, :utf8, :raw, etc ...
my $enc_string = ($enc =~ /:/) ? "$enc" : (length($enc) ? "encoding($enc)" : '');
my $write_file = 1;
my $test_surr = 0;
my ($fh, $line);
my ($width,$bom,$offset);
# Test surrogate pairs ..
#
test_surrogate_pair($Unichar) if $test_surr;
# Create a test file with encoding ..
#
if ($write_file) {
print "\nWriting sample data as '$enc' to $fname\n",'-'x20,"\n";
open $fh, ">$enc_string", $fname or die "can't open $fname for write $!";
print $fh $data;
close $fh;
}
# Read test file with encoding ..
#
print "\nReading $fname as '$enc'\n",'-'x20,"\n";
open $fh, "<$enc_string", $fname or die "can't open $fname for read $!";
CheckUTF($fh,'size'=> 100, 'verbose'=> 1);
($width,$bom,$offset) = Get_BOM( $fh, 'v' ); # list context, verbose prints
# $offset = Get_BOM( $fh ); # scalar context, no prints
print "\nGet_BOM returned $bom, $width, offset = $offset\n";
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ";
for (map {ord $_} split //, $line) {
printf ("%x ",$_);
}
print "\n";
}
close $fh;
# Read test file in byte mode ..
#
print "\nReading $fname as bytes\n",'-'x20,"\n";
open $fh, '<', $fname or die "can't open $fname for read $!";
$offset = 0;
for (1 .. 3) {
print "Pass $_: ";
seek($fh, $offset, 0); # SEEK_SET
my $fpos = tell($fh);
my $ok = ($fpos == $offset) ? 'ok' : 'bad';
$line = <$fh>;
print "seek $offset/$fpos $ok, data (".length($line).") = ";
for (map {ord $_} split //, $line) {
printf ("%x ",$_);
}
print "\n";
}
close $fh;
exit 0;
##
## End of program ...
##
sub Get_BOM
{
my ($fh, $verbose) = @_;
$verbose = 0 unless (defined $verbose and $verbose);
use Encode;
my %bom2enc = (
map { encode($_, "\x{feff}") => $_ } qw(
UTF-8
UTF-16BE
UTF-16LE
UTF-32BE
UTF-32LE
)
);
my %enc2bom = (
reverse(%bom2enc),
map { $_ => encode($_, "\x{feff}") } qw(
UCS-2
iso-10646-1
utf8
)
);
my @bombs = sort { length $b <=> length $a } keys %bom2enc;
my $MAX_BOM_LENGTH = length $bombs[0];
my $bomstr = join '|', @bombs;
my $bom_re = qr/^($bomstr)/o;
my $layers = "@{[PerlIO::get_layers($fh)]}";
print "\nGet_BOM() - Layers = $layers\n" if $verbose;
if (defined($fh) && (ref($fh) eq 'GLOB' || ref(\$fh) eq 'GLOB'))
{
# Note - Must SEEK_SET/read twice to
# mitigate UTF-16/32 BOM seek bug
# -----------------------------------
if (seek($fh, 0, 0))
{
my $sample = '';
read ($fh, $sample, 1);
seek($fh, 0, 0); # SEEK_SET
my $fpos = tell($fh);
my @samparray = ();
## Read in $MAX_BOM_LENGTH 'characters'
## in whatever encoding the file was opened with.
## Store each 'character' and bytes/char - based
## on the differential file position per read.
## A BOM can be sucessfully read un-alterred in
## 1,2,4 or 8 byte sequences.
## -------------------------------------------
for (1 .. $MAX_BOM_LENGTH)
{
my $count = read ($fh, my $buff, 1);
last if (!defined $count or $count != 1);
push @samparray, {'char'=> $buff, 'width'=> (tell($fh)-$fpos)};
$fpos = tell($fh);
}
$sample = '';
print ">> Reading up to $MAX_BOM_LENGTH characters\n" if $verbose;
for my $aref (@samparray) {
$sample .= $aref->{'char'};
printf (" - char %x , bytes = %d\n", ord($aref->{'char'}), $aref->{'width'}) if $verbose;
}
seek($fh, 0, 0); # SEEK_SET, last seek, set to position 0
if ($verbose) {
print " - Read ".length($sample)." characters. Position = $fpos\n";
print " - Sample (".length($sample).") = ";
for (map {ord $_} split //, $sample) {
printf ("%02x ", $_);
}
print "\n>> Converting to bytes\n";
}
## Convert the 'characters' to bytes.
## Only process $MAX_BOM_LENGTH bytes (octets).
## We only care about the BOM, which will match
## the encoding bytes we already have in %bom2enc.
## -----------------------------------------------
my ($octets, $offset) = ('',0);
my $numb_octets = 0;
for my $aref (@samparray)
{
my @ar = ();
my ($width, $ordchar) = (
$aref->{'width'},
ord( $aref->{'char'})
);
last if (($numb_octets + $width) > $MAX_BOM_LENGTH);
$numb_octets += $width;
while ($ordchar > 0) {
push @ar, $ordchar & 0xff;
$ordchar >>= 8;
--$width;
}
push (@ar,0) while ($width-- > 0);
for (reverse @ar) {
vec ($octets, $offset++, 8) = $_;
}
}
if ($verbose) {
print " BOMS avail = ";
for my $bom (@bombs) {
print '( ';
for (map {ord $_} split //, $bom) {
printf ("%02x",$_);
}
print ' ) ';
}
print "\n - Bom octets from sample (".length($octets).") = ";
for (map {ord $_} split //, $octets) {
printf ("%02x ", $_);
}
print "\n";
}
# end convert
if (my ($found) = $octets =~ /$bom_re/) {
my ($offset, $bomstring) = (
length($found),
$bom2enc{$found}
);
if ($verbose) {
print ">> Found $bomstring, BOM = ";
for (map {ord $_} split //, $found) {
printf ("%02x", $_);
}
print "\n";
}
my $noendian = $bomstring;
$noendian =~ s/(?:LE|BE)$//i;
my $width = ($layers =~ /$noendian/i) ? 'ok' : 'mismatch';
print " - Does not match width of current encoding layer\n" if ($verbose and $width ne 'ok');
return ($width, $bomstring,$offset);
}
print ">> BOM Not found\n" if $verbose;
return ('','',0);
}
# can't seek here, fall through
}
# There was an error ..
print "\nGet_BOM() - Layers = $layers\n" if (!$verbose);
print ">> Invalid filehandle or file is unseekable\n - Utf BOMs available (max length = $MAX_BOM_LENGTH):\n";
while (my ($key,$val) = each %enc2bom)
{
my $valstring = '';
for (split //, $val) {
$valstring .= sprintf ("%x ",ord $_);
}
print " $key = $bom2enc{$val} = $valstring\n";
#print " $key = $bom2enc{$val} = '$val' = '$enc2bom{$key}' = $valstring\n";
}
return ('','',-1);
}
sub CheckUTF
{
# UTF-8/16/32 check
# Will add a layer if there is no current
# encoding() layer. So open filehandle should
# be in byte context.
# ----------------------------------------
# Result value:
# -1 - not filehandle (or undef'ed)
# 0 - read error or empty file
# 1 - already have encoding layer (could force change later)
# 2 - guess_encoder() error message
# 3 - do nothing unless utf-16/32(be/le)
# 4 - added a layer
my ($fh,@args) = @_;
my %parm = ('verbose'=> 0, 'size'=> 60);
while (my ($name, $val) = splice (@args, 0, 2)) {
$name =~ s/^\s+|\s+$//; next if not defined $val;
if (lc($name) eq 'verbose' || (lc($name) eq 'size' && $val > 4))
{ $parm{$name} = $val; }
}
my ($Res, $Utfmsg, $Layers) = (0, 'UTF Check', '');
if (!defined($fh) || (ref($fh) ne 'GLOB' && ref(\$fh) ne 'GLOB')) {
$Utfmsg .= ", not a filehandle ..";
print "$Utfmsg\n" if $parm{'verbose'};
return wantarray ? ($Res,$Utfmsg,'') : $Utfmsg;
}
$Layers = ':'.join (':', PerlIO::get_layers($fh)).':';
if ($Layers =~ /:encoding/) {
$Utfmsg .= ", already have encoding layer";
$Res = 1;
} else {
my ($count, $sample);
my $utf8layer = $Layers =~ /:utf8/;
binmode ($fh,":bytes");
seek ($fh, 0, 0);
# Try to read a large sample
if (!defined($count = read ($fh,$sample,$parm{'size'},0))) {
$Utfmsg .= ". $!"; # read error
} elsif ($count <= 0) {
$Utfmsg .= ". File is empty";
} else {
seek ($fh, 0, 0); # SEEK_SET
use Encode::Guess;
my $decoder = guess_encoding ($sample); # ascii/utf8/BOMed UTF
$Res = 2;
if (ref($decoder)) {
my $name = $decoder->name;
$decoder = 'Do nothing';
$Res = 3;
$Utfmsg .= ", guess($count read): $name";
if ($name =~ /UTF.*?(?:16|32)/i) {
# $name =~ s/(?:LE|BE)$//i;
$decoder = "Adding layer";
$Res = 4;
binmode ($fh, ":encoding($name)");
}
}
$Utfmsg .= ". $decoder" if (defined $decoder); # guess error
}
binmode ($fh,":utf8") if $utf8layer;
}
$Utfmsg .= ' ..';
$Layers = "@{[PerlIO::get_layers($fh)]}";
print "@{[$Utfmsg,$Layers]}\n" if $parm{'verbose'};
return wantarray ? ($Res,$Utfmsg,$Layers) : "@{[$Utfmsg,$Layers]}";
}
sub test_surrogate_pair
{
my $test = shift;
print "\nTesting surrogate pairs\n",'-'x20,"\n";
if (ord($test) < 0x10000) {
print "nothing to test < 0x10000\n";
} else {
my $hi = (ord($test) - 0x10000) / 0x400 + 0xD800;
my $lo = (ord($test) - 0x10000) % 0x400 + 0xDC00;
my $uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
printf "test uni = %x\n", ord($test);
printf "hi surrogate = %x\n", $hi;
printf "lo surrogate = %x\n", $lo;
printf "uni = %x\n", $uni;
}
}
__END__
C:\temp>perl bomtest.pl
Writing sample data as 'UTF-16' to dummy.txt
--------------------
Reading dummy.txt as 'UTF-16'
--------------------
UTF Check, already have encoding layer .. unix crlf encoding(UTF-16) utf8
Get_BOM() - Layers = unix crlf encoding(UTF-16) utf8- char feff , bytes = 2
- char 1 , bytes = 2
- char 2 , bytes = 2
- char 21000 , bytes = 4
- Read 4 characters. Position = 10
- Sample (4) = feff 01 02 21000BOMS avail = ( fffe0000 ) ( 0000feff ) ( efbbbf ) ( fffe ) ( feff )
- Bom octets from sample (4) = fe ff 00 01
Get_BOM returned UTF-16BE, ok, offset = 2
Pass 1: seek 2/2 ok, data (3) = 1 2 21000
Pass 2: seek 2/2 ok, data (3) = 1 2 21000
Pass 3: seek 2/2 ok, data (3) = 1 2 21000
Reading dummy.txt as bytes
--------------------
Pass 1: seek 0/0 ok, data (10) = fe ff 0 1 0 2 d8 44 dc 0
Pass 2: seek 0/0 ok, data (10) = fe ff 0 1 0 2 d8 44 dc 0
Pass 3: seek 0/0 ok, data (10) = fe ff 0 1 0 2 d8 44 dc 0
C:\temp>