I am trying to improve the warning message issued by Encode::decode()
. Instead of printing the name of the module and the line number in the module, I would like it to print the name of the file being read and the line number in that file where the malformed data was found. To a developer, the origial message can be useful, but to an end user not familiar with Perl, it is probably quite meaningless. The end user would probably rather like to know which file is giving the problem.
I first tried to solve this using a $SIG{__WARN__}
handler (which is probably not a good idea), but I get a segfault. Probably a silly mistake, but I could not figure it out:
#! /usr/bin/env perl
use feature qw(say);
use strict;
use warnings;
use Encode ();
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';
my $fn = 'test.txt';
write_test_file( $fn );
# Try to improve the Encode::FB_WARN fallback warning message :
#
# utf8 "\xE5" does not map to Unicode at <module_name> line xx
#
# Rather we would like the warning to print the filename and the line number:
#
# utf8 "\xE5" does not map to Unicode at line xx of file <filename>.
my $str = '';
open ( my $fh, "<:encoding(utf-8)", $fn ) or die "Could not open file '$fn': $!";
{
local $SIG{__WARN__} = sub { my_warn_handler( $fn, $_[0] ) };
$str = do { local $/; <$fh> };
}
close $fh;
say "Read string: '$str'";
sub my_warn_handler {
my ( $fn, $msg ) = @_;
if ( $msg =~ /\Qdoes not map to Unicode\E/ ) {
recover_line_number_and_char_pos( $fn, $msg );
}
else {
warn $msg;
}
}
sub recover_line_number_and_char_pos {
my ( $fn, $err_msg ) = @_;
chomp $err_msg;
$err_msg =~ s/(line \d+)\.$/$1/; # Remove period at end of sentence.
open ( $fh, "<:raw", $fn ) or die "Could not open file '$fn': $!";
my $raw_data = do { local $/; <$fh> };
close $fh;
my $str = Encode::decode( 'utf-8', $raw_data, Encode::FB_QUIET );
my ($header, $last_line) = $str =~ /^(.*\n)([^\n]*)$/s;
my $line_no = $str =~ tr/\n//;
++$line_no;
my $pos = ( length $last_line ) + 1;
warn "$err_msg, in file '$fn' (line: $line_no, pos: $pos)\n";
}
sub write_test_file {
my ( $fn ) = @_;
my $bytes = "Hello\nA\x{E5}\x{61}"; # 2 lines ending in iso 8859-1: åa
open ( my $fh, '>:raw', $fn ) or die "Could not open file '$fn': $!";
print $fh $bytes;
close $fh;
}
Output:
utf8 "\xE5" does not map to Unicode at ./p.pl line 27
, in file 'test.txt' (line: 2, pos: 2)
Segmentation fault (core dumped)