2

I'm trying to get a whole bunch of values from around 3,000 HTML files and save them to a spreadsheet.

I'm using HTML::TreeBuilder to process the HTML and creating a spreadsheet using Spreadsheet::WriteExcel.

But my script doesn't successfully get the values. I see

Use of uninitialized value $val in concatenation (.) or string at spreadsheet.pl line 63.

What might I be doing wrong?

Here's an example of my HTML files on pastebin.com. It was too large to post in the question.

My Perl code

use warnings 'all';
use strict;

use LWP::Simple 'get';
use Spreadsheet::WriteExcel;
use HTML::TreeBuilder;
use Path::Tiny;
use constant URL => 'http://pastebin.com/raw/qLwu80ZW';

my $teamNumber  = "";
my $teamName    = "";
my $schoolName  = "";
my $area        = "";
my $district    = "";
my $agDeptPhone = "";
my $schoolPhone = "";
my $fax         = "";
my $addressOne  = "";
my $addressTwo  = "";
my $city        = ""; 
my $state       = "";
my $zipCode     = "";
my $name        = "";
my $email       = "";
my $row         = "";
my $Ypos        = 0; 

my $path = "Z:\\_WEB_CLIENTS\\Morgan Livestock\\Judging Card";

my $workbook  = Spreadsheet::WriteExcel->new('perlOutput.xlsx');
my $worksheet = $workbook->add_worksheet();


sub getTeamNumber {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_TeamNumber/)->attr('value');
    }

    print "Got Team Number $val\n";

    return $val;
}


sub getTeamName {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_TeamName/)->attr('value');
    }

    print "Got Team Name $val\n";

    return $val;
}


sub getSchoolName {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(tag_ => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_SchoolName/)->attr('value');
    }

    print "Got School Name $val\n";

    return $val;
}


sub getArea{
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(tag_ => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Area/)->attr('value');
    }

    print "Got Area $val\n";

    return $val;
}


sub getDistrict{
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_District/)->attr('value');
    }

    print "Got District $val\n";

    return $val;
}


sub getDeptPhone {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Phone/)->attr('value');
    }

    print "Got Dept Phone $val\n";

    return $val;
}


sub getSchoolPhone{
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Phone2/)->attr('value');
    }

    print "Got School Phone $val\n";

    return $val;
}


sub getFax{
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Fax/)->attr('value');
    }

    print "Got Fax $val\n";

    return $val;
}


sub getAddress1 {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Address1/)->attr('value');
    }

    print "Got Address One $val\n";

    return $val;
}


sub getAddress2 {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Address2/)->attr('value');
    }

    print "Got Address Two $val\n";

    return $val;
}


sub getCity {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_City/)->attr('value');
    }

    print "Got Address Two $val\n";

    return $val;
}


sub getState {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_State/)->attr('value');
    }

    print "Got State $val\n";

    return $val;
}


sub getZip {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Zip/)->attr('value');
    }

    print "Got Zip $val\n";

    return $val;
}


sub getWebsite {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $val;

    foreach my $node (@nodes) {
        $val = $node->look_down('name', qr/\$txt_Website/)->attr('value');
    }

    print "Got Website $val\n";

    return $val;
}


sub getNameAndEmail {
    my ($file) = @_;

    my $tree = HTML::TreeBuilder->new_from_content(get URL);
    my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');

    for my $tr ( $table->look_down(_tag => 'tr') ) {
        next unless my @td = $tr->look_down(_tag => 'td');
        my ($name, $email) = map { $_->as_trimmed_text } @td[0,1];
    }

    print "Got Name and Email $name and $email\n";

    return ($name, $email);
}

# FILLER: This fills the spreadsheet with all the variables we've acquired

sub fill {
    my ($name, $email, $teamNumber, $teamName, $schoolName,
        $area, $district, $agDeptPhone, $schoolPhone,
        $fax, $addressOne, $addressTwo, $city, $state, $zipCode) = (@_);

    $worksheet->write($Ypos, 1, $name);
    $worksheet->write($Ypos, 2, $email);
    $worksheet->write($Ypos, 3, $teamNumber);
    $worksheet->write($Ypos, 4, $teamName);
    $worksheet->write($Ypos, 5, $schoolName);
    $worksheet->write($Ypos, 6, $area);
    $worksheet->write($Ypos, 7, $district);
    $worksheet->write($Ypos, 8, $agDeptPhone);
    $worksheet->write($Ypos, 9, $schoolPhone);
    $worksheet->write($Ypos, 10, $fax);
    $worksheet->write($Ypos, 11, $addressOne);
    $worksheet->write($Ypos, 12, $addressTwo);
    $worksheet->write($Ypos, 13, $city);
    $worksheet->write($Ypos, 14, $state);
    $worksheet->write($Ypos, 15, $zipCode);
}

# Open judgingcard directory

opendir (DIR, $path) or die "Unable to open directory 'Judging Card': $!";

my @files = readdir(DIR);

# This fills out all top row info

$worksheet->write("A1", "Name");
$worksheet->write("B1", "Email");
$worksheet->write("C1", "Team Number");
$worksheet->write("D1", "Team Name");
$worksheet->write("E1", "School Name");
$worksheet->write("F1", "Area");
$worksheet->write("G1", "District");
$worksheet->write("H1", "Ag Dept Phone");
$worksheet->write("I1", "School Phone");
$worksheet->write("J1", "Fax");
$worksheet->write("K1", "Address One");
$worksheet->write("L1", "Address Two");
$worksheet->write("M1", "City");
$worksheet->write("N1", "State");
$worksheet->write("O1", "Zip Code");

###################################

foreach my $file (@files) { # run through all files in directory

    next if (-d $file); # Skip file if file is folder

    $Ypos = $Ypos + 1;

    my ($name1, $email1) = getNameAndEmail($file);

    $name        = $name1;
    $email       = $email1;
    $teamNumber  = getTeamNumber($file);
    $teamName    = getTeamName($file);
    $schoolName  = getSchoolName($file);
    $area        = getArea($file);
    $district    = getDistrict($file);
    $agDeptPhone = getDeptPhone($file);
    $schoolPhone = getSchoolPhone($file);
    $fax         = getFax($file);
    $addressOne  = getAddress1($file);
    $addressTwo  = getAddress2($file);
    $city        = getCity($file);
    $state       = getState($file);
    $zipCode     = getZip($file);

    fill($name, $email, $teamNumber, $teamName, $schoolName,
        $area, $district, $agDeptPhone, $schoolPhone, $fax,
        $addressOne, $addressTwo, $city, $state, $zipCode);

    print "Progressing                    $file                ($Ypos)\n"
}

closedir(DIR);


sub getTeamNumber {
    my ($file) = @_;

    my $html = path($file);
    my $tree = HTML::TreeBuilder->new_from_content($html);
    my @nodes = $tree->look_down(_tag => 'input');

    my $name;
    my $val;

    foreach my $node (@nodes) {
        $name = $node->look_down('name', qr/\$txt_TeamNumber/);
    }

    if ( ! defined $name ) {
        print "Couldn't get team number\n";
    }

    if ( $name ) {
        $val = $name->attr('value');
        print "Got Team number $val\n";
    }

    return $val;
}

New script:

use LWP::Simple 'get';
use Spreadsheet::WriteExcel;
use HTML::TreeBuilder;
use Path::Tiny;

my $path = "Z:\\_WEB_CLIENTS\\Morgan Livestock\\Judging Card";

my $workbook  = Spreadsheet::WriteExcel->new('perlOutput.xlsx');
my $worksheet = $workbook->add_worksheet();

opendir (DIR, $path) or die "Unable to open directory 'Judging Card': $!";
my @files = readdir(DIR);

# Specify spreadsheet headers in desired order and write  to file
my @headers = ('Name', 'Email', 'Team Number', 'Team Name', 'School Name', 'Area', 'District', 'Ag Dept Phone', 'School Phone', 'Fax', 'Address One'
    , 'Address Two', 'City', 'State', 'Zip Code');
$worksheet->write_row(0, 0, \@headers);               # first row

# Build ancillary data structures to later sort results by this order
# each header with its index from @headers (specifies columns' order)
my %ho = map { state $idx; $_ => ++$idx } @headers;
# each name (`TeamNumber` ...) with the index of its header
my %name_order = ( Name => $ho{Name}, Email => $ho{Email}, 
    TeamNumber => $ho{'Team Number'}, TeamName => $ho{'Team Name'}, SchoolName => $ho{'School Name'}, Area => $ho{'Area'}, District => $ho{'District'}, 
        AgDeptPhone => $ho{'Ag Dept Phone'}, SchoolPhone => $ho{'School Phone'}, Fax => $ho{'Fax'}, AddressOne => $ho{'Address One'},
        AddressTwo => $ho{'Address Two'}, City => $ho{'City'}, State => $ho{'State'}, Zip => $ho{'Zip Code'});
        
sub getNames {
    my ($file) = @_;
    my $tree = HTML::TreeBuilder->new_from_content( path($file) );
    my @nodes = $tree->look_down(_tag => 'input');

    # List phrases to find, and build hash with their derived names
    # Should probably be defined globally, once for the whole program
    my @patterns  = map { '$txt_' . $_ } 
        qw(TeamName TeamNumber SchoolName Area District 
           Phone Phone2 Fax Address1 Address2 City State Zip Website);

    # Name for each pattern: everything after first _ (so after $txt_) 
    my %patt_name = map { $_ => (/[^_]+_(.*)/)[0] } @patterns;
    my %name_val;
    foreach my $node (@nodes) {
        foreach my $patt (@patterns) {
            my $name = $node->look_down('name', qr/\Q$patt/);
            if ($name) {
                $name_val{$patt_name{$patt}} = $name->attr('value') || '';
            }
        }
    }

    # Name and Email are stored differently. Fetch those now
    my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');
    for my $tr ( $table->look_down(_tag => 'tr') ) { 
        next unless my @td = $tr->look_down(_tag => 'td');
        # Discard incomplete Name-Email records -- either both or none
        @name_val{qw(Name Email)} = 
            map { (defined) ? $_->as_trimmed_text : '' } @td[0,1];
    }
    return \%name_val;
}

sub fill_row {
   my ($ws, $row, $rdata, $rorder) = @_;    
   my %name_val   = %$rdata;
   my %name_order = %$rorder;

   my @vals = map { $name_val{$_} } 
              sort { $name_order{$a} <=> $name_order{$b} } 
              keys %name_val;

   $ws->write_row($row, 0, \@vals);  # add check (returns 0 on success)

   return 1;

    my $row = 1;
}

foreach my $file (@files) {
    next if -d $file;

    my %name_val = %{ getNames($file) };

    foreach my $name (sort keys %name_val) {
        # Fill the spreadsheet with all info in one go
        if ($name_val{$name}) {
            print "$name => $name_val{$name}\n";
        } else {
            print "Not found $name in $file\n";
        }
    }

    
    my %name_val = %{ getNames($file) };
    fill_row($worksheet, $row++, \%name_val, \%name_order);

    foreach my $name (sort keys %name_val) {  # demo
        if ($name_val{$name}) { print "$name => $name_val{$name}\n" }
        else                  { print "Not found $name in $file\n" }
    }
    
    print "Progressing $Ypos \n"
}

                                                                                                  
Community
  • 1
  • 1
  • 2
    Please read [mcve]. – melpomene Apr 03 '17 at 20:14
  • @zdim Thank you for your answer! Based on it, I updated one of my subfunctions (Which I added in the thread) and tried to run it, and got this: `Couldn't get team number` Any idea why It can't get it? – Ultracrepidarian Apr 03 '17 at 20:16
  • Note: I realized I made a mistake in doing `!defined` instead of `defined`, so I changed that, and instead of getting the error message, I got no message at all. – Ultracrepidarian Apr 03 '17 at 20:23
  • Yourr `defined` test needs to be inside of the loop. Also, provide `else` where you print a message if the thing isn't found. See my answer, now posted. Let me know if it needs explanations. – zdim Apr 03 '17 at 20:39
  • You don't encourage people to answer your questions. There are so many headers and irrelevant narrative that the information gets lost, and you don't bother to format your code so that it's readable. I've said this to you twice before, but all you seem to take away is the Perl code that does what you ask, and ignore any other advice. I've edited your question again, but please make an effort to write something more like this next time. – Borodin Apr 03 '17 at 22:19
  • 1
    I have to say that your code is very extended and wasteful. You have fifteen or so values that you want to retrieve, and you've coded a separate variable and subroutine for every one of them. What's more, each of those subroutines opens the HTML file, reads it into memory, parses it, extracts one of the values, and then throws all of that away to be repeated by the next subroutine. This happens 45,000 times (15 x 3,000) and must be horribly slow. – Borodin Apr 03 '17 at 22:29

2 Answers2

2

You probably could reduce the code. Even if in the following example me not using HTML::TreeBuidler, the approach is similar. Using Mojo::DOM58,

use 5.014;
use warnings;

use Mojo::DOM58;
use Path::Tiny;
use Data::Dumper;

my @fields = qw( TeamName TeamNumber SchoolName Area District Phone Phone2 Fax Address1 Address2 City State Zip Website );

my $html = path('team.html')->slurp;
my $dom = Mojo::DOM58->new($html);

my $data;
for my $field( @fields ) {
    $data->{$field} = $dom->at(qq{input[name*="txt_$field"]})->attr('value') // "";
}

say Dumper $data;

prints:

$VAR1 = {
          'TeamName' => 'Ruidoso',
          'Zip' => '',
          'State' => 'NM',
          'City' => '',
          'District' => '1',
          'Phone2' => '',
          'Area' => '1',
          'SchoolName' => '',
          'Address2' => '',
          'Website' => '',
          'Address1' => '',
          'Phone' => '',
          'Fax' => '',
          'TeamNumber' => '83'
        };
clt60
  • 62,119
  • 17
  • 107
  • 194
1

In short, some of those 'name' are probably just not found in (some of) HTML files. So test first to see whether it is there, then write to $val or print message about it not being found.

The most apparent thing to be improved: there is no need for separate functions. You can search and find all of them in one call, and store them in the hash name => value, which is returned.

sub getNames {
    my ($file) = @_;
    my $tree = HTML::TreeBuilder->new_from_content( path($file) );
    my @nodes = $tree->look_down(_tag => 'input');

    # List phrases to find, and build hash with their derived names
    # Should probably be defined globally, once for the whole program
    my @patterns  = map { '$txt_' . $_ } 
        qw(TeamName TeamNumber SchoolName Area District 
           Phone Phone2 Fax Address1 Address2 City State Zip Website);

    # Name for each pattern: everything after first _ (so after $txt_) 
    my %patt_name = map { $_ => (/[^_]+_(.*)/)[0] } @patterns;
    my %name_val;
    foreach my $node (@nodes) {
        foreach my $patt (@patterns) {
            my $name = $node->look_down('name', qr/\Q$patt/);
            if ($name) {
                $name_val{$patt_name{$patt}} = $name->attr('value') || '';
            }
        }
    }

    # Name and Email are stored differently. Fetch those now
    my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');
    for my $tr ( $table->look_down(_tag => 'tr') ) { 
        next unless my @td = $tr->look_down(_tag => 'td');
        # Discard incomplete Name-Email records -- either both or none
        if (2 == grep { not ref $_ } @td) {
            @name_val{qw(Name Email)} = map { $_->as_trimmed_text } @td[0,1];
        }
        else { @name_val{qw(Name Email)} = ('', '') }
    }
    return \%name_val;
}

For Name and Email we require both to be there as text, or both are thrown out. (The sample source has There are no people ... inside a div for Name, and nothing for Email.)

To get whatever is there, instead of if-else above use

@name_val{qw(Name Email)} = 
    map { (defined) ? $_->as_trimmed_text : '' } @td[0,1];

and we get the note quoted above for Name and an empty string for Email, with this sample.

Then

# Specify spreadsheet headers in desired order and write  to file
my @headers = ('Name', 'Email', 'Team Number', 'Team Name', ...);
$worksheet->write_row(0, 0, \@headers);               # first row

# Build ancillary data structures to later sort results by this order
# each header with its index from @headers (specifies columns' order)
my %ho = map { state $idx; $_ => ++$idx } @headers;
# each name (`TeamNumber` ...) with the index of its header
my %name_order = ( Name => $ho{Name}, Email => $ho{Email}, 
    TeamNumber => $ho{'Team Number'}, TeamName => $ho{'Team Name'}, ... 
);

my $row = 1;
foreach my $file (@files) {
    next if -d $file;

    my %name_val = %{ getNames($file) };
    fill_row($worksheet, $row++, \%name_val, \%name_order);

    foreach my $name (sort keys %name_val) {  # demo
        if ($name_val{$name}) { print "$name => $name_val{$name}\n" }
        else                  { print "Not found $name in $file\n" }
    }
}

sub fill_row {
   my ($ws, $row, $rdata, $rorder) = @_;    
   my %name_val   = %$rdata;
   my %name_order = %$rorder;

   my @vals = map { $name_val{$_} } 
              sort { $name_order{$a} <=> $name_order{$b} } 
              keys %name_val;

   $ws->write_row($row, 0, \@vals);  # add check (returns 0 on success)

   return 1;
}

The write_row takes a reference to an array and writes out a row with its elements. Note that write can be used that way, too, when an arrayref is given.

The output on the linked HTML file

Area => 1
District => 1
State => NM
TeamName => Ruidoso
TeamNumber => 83

and Not found ... for others. The .xls file is correct (when the full list of names is used).


The whole program

use warnings;
use strict;
use feature qw(say state);

use Path::Tiny;
use HTML::TreeBuilder;
use Spreadsheet::WriteExcel;


my @src = qw(TeamName TeamNumber SchoolName Area District Phone Phone2 
    Fax Address1 Address2 City State Zip Website);
my @headers = ('Name', 'Email', 'Team Number', 'Team Name', 'School Name', 
    'Area', 'District', 'Ag Dept Phone', 'School Phone', 'Fax', 'Address One', 
    'Address Two', 'City', 'State', 'Zip Code', 'Web Site'
);
my @lens = map { length } @headers;  # for printing

# Numeric order of headers' fields (so, columns)
my %ho = map { state $idx; $_ => ++$idx } @headers;
# Translation: name from HTML source => column number (retrieved from %ho)
my %name_order = ( 
    Name => $ho{Name}, Email => $ho{Email}, 
    TeamNumber => $ho{'Team Number'},
    TeamName => $ho{'Team Name'}, 
    SchoolName => $ho{'School Name'}, Area => $ho{'Area'},
    District => $ho{'District'}, Phone2 => $ho{'Ag Dept Phone'},
    Phone => $ho{'School Phone'}, Fax => $ho{'Fax'}, 
    Address1 => $ho{'Address One'},
    Address2 => $ho{'Address Two'}, 
    City => $ho{'City'}, State => $ho{'State'}, 'Zip' => $ho{'Zip Code'},
    Website => $ho{'Web Site'}
);

say "Order (column) of names from HTML source to follow headers:";
printf("%-10s  ==>  %s\n", $_, $name_order{$_})
    for sort { $name_order{$a} <=> $name_order{$b} } keys %name_order;
say '';

my $workbook  = Spreadsheet::WriteExcel->new('data.xls');
my $worksheet = $workbook->add_worksheet();

# Print headers to .xls file (and to screen)
$worksheet->write_row(0, 0, \@headers);
say "Spreadsheet, header and rows:";
prn_row(\@headers);  # print to screen

my @files = ('fetch_names.html');
my $row = 1;
foreach my $file (@files) {
    next if -d $file;

    # Parse the file, print the row to spreadsheet
    my %name_val = %{ getNames($file) };
    fill_row($worksheet, $row++, \%name_val, \%name_order);
}


# Functions

sub fill_row {
    my ($ws, $row, $rdata, $rorder) = @_;

    my %name_val = %$rdata;
    my $name_order = %$rorder;

    my @vals =
        map { $name_val{$_} }
        sort { $name_order{$a} <=> $name_order{$b} }
        grep { exists $name_order{$_} }
        keys %name_val;

    prn_row(\@vals);  # print to screen

    $worksheet->write_row($row, 0, \@vals);  # test this (returns 0 on success)

    return 1;
}

sub prn_row {
    my @ary = @{ $_[0] };
    for (0..$#ary) {
        my $len = $lens[$_];
        printf("%${len}s  ", $ary[$_]);
    }
    say '';
}

sub getNames {
    my ($file) = @_;
    my $tree = HTML::TreeBuilder->new_from_content( path($file)->slurp );
    my @nodes = $tree->look_down(_tag => 'input');

    my @patterns = map { '$txt_' . $_ } @src;
    # List phrases to find, and build hash with their derived names
    # Name for each pattern: everything first _ (so after \$txt_)
    my %patt_name = map { $_ => (/[^_]+_(.*)/)[0] } @patterns;

    my %name_val;
    foreach my $node (@nodes) {
        foreach my $patt (@patterns) {
            my $name = $node->look_down('name', qr/\Q$patt/) or next;
            $name_val{$patt_name{$patt}} = $name->attr('value') // '';
        }
    }

    # Name and Email are stored differently, fetch those now
    my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');
    for my $tr ( $table->look_down(_tag => 'tr') ) {
        next unless my @td = $tr->look_down(_tag => 'td');
        # Discard incomplete Name-Email records -- either both or none
        if (2 == grep { not ref } @td) {
            @name_val{qw(Name Email)} = map { $_->as_trimmed_text } @td[0,1];
        }
        else { @name_val{qw(Name Email)} = ('', '') }
    }
    return \%name_val;
}

This works as a complete program with the supplied sample HTML source.


Addition: an actual page may have multiple name-email pairs

use LWP::Simple qw(get);

sub getNames {
    my ($file) = @_; 
    my $url = 'https://www.judgingcard.com/Directory/Directory.aspx?ID=1643';
    my $page = get($url) or die "Can't get the page $url: $!";
    my $tree = HTML::TreeBuilder->new_from_content( $page );

    my @nodes = $tree->look_down(_tag => 'input');

    my @patterns = map { '$txt_' . $_ } @src;
    # List phrases to find, and build hash with their derived names
    # Name for each pattern: everything first _ (so after \$txt_)
    my %patt_name = map { $_ => (/[^_]+_(.*)/)[0] } @patterns;

    my %name_val;
    foreach my $node (@nodes) {
        foreach my $patt (@patterns) {
            my $name = $node->look_down('name', qr/\Q$patt/) or next;
            $name_val{$patt_name{$patt}} = $name->attr('value') // ''; 
        }   
    }   

    # Name and Email are stored differently, fetch those now
    my %name_email;
    my ($table) = $tree->look_down(_tag => 'table', class => 'rgMasterTable');
    for my $tr ( $table->look_down(_tag => 'tr') ) { 
        next unless my @td = $tr->look_down(_tag => 'td');
        # There may be more than one Name-Email pair
        # so enter key-value pair explicitely
        if (2 <= grep { ref } @td) {
            $name_email{$td[0]->as_trimmed_text} = $td[1]->as_trimmed_text;
        }
        else { %name_email = ('', '') }
    }
    return \%name_val, \%name_email;
}

Then in the main you need

foreach my $file (@files) {
    next if -d $file;

    # Parse the file, unpack name-value and name-email hashes
    my ($rname_val, $rname_email) = getNames($file);
    my %name_val   = %$rname_val;
    my %name_email = %$rname_email;

    # Print a row for each Name-Email, adding them to %name_val
    foreach my $name (keys %name_email) {
        $name_val{Name}  = $name;
        $name_val{Email} = $name_email{$name};
        fill_row($worksheet, $row++, \%name_val, \%name_order);
    }
}

The desired format with mutpliple Name-Email pairs is: same headers, and for each pair a separate row is printed to file, where all information other than Name-Email is the same.

The printed spreadsheet (URL used was provided in comments)

Printed spreadsheet. URL was provided in comments

zdim
  • 64,580
  • 5
  • 52
  • 81
  • Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/141741/discussion-between-ultracrepidarian-and-zdim). – Ultracrepidarian Apr 14 '17 at 19:44
  • @Ultracrepidarian I edited my last addition (last "section", below the horizontal line), so now it prints a spreadsheet per your template. I'll read over and edit a little more, perhaps – zdim Apr 24 '17 at 17:24
  • @Ultracrepidarian You have the "Complete program" from before. Now, what is shown in the last section replaces a couple of parts: the `getNames()` sub (where really just a few lines are changed), and the `foreach my $file (@files) { }` loop in the main. Note that in `getNames()` I fetch the page from internet, you may need to adjust that. Also, I used here `LWP::Simple` and if you keep that then you need `use LWP::Simple;` at the top. I am trying here to crop the image of the result spreadsheet but something's messing with me ... it's exactly your template from docs.google – zdim Apr 24 '17 at 17:32
  • @Ultracrepidarian I added the image of the printed spreadsheet, at the end – zdim Apr 24 '17 at 17:39
  • @Ultracrepidarian This had come up already -- use the program from "_The whole program_" section. (In order to use `state` (and `say`, which is nice) one needs to say `use feature qw(say state);`, which is in "The whole program".) Then, in it you replace the `getNames()` and `foreach my $file (@files) { }` by the ones given in the last, new, section – zdim Apr 24 '17 at 17:57
  • @Ultracrepidarian One more comment. If you have those HTML pages in files, then in the beginning of `getNames()` you only need `my ($file) = @_;` and then `my $tree = HTML::TreeBuilder->new_from_content( path($file) );`, and then the rest of the sub to process that. It's like in the first code section in my answer. (In this last version I fetch the page from the URL you gave so that I can test.) – zdim Apr 24 '17 at 18:08
  • @Ultracrepidarian What you are posting is not the "_The whole program_" from my answer. The subs are in the middle, the `fill_row()` doesn't have the closing `}` (so it shouldn't even run) ... please see my previous two comments -- you use the "_The whole program_", replace the `getNames()` sub and `foreach my $file (@files) { }` loop in the main, and change the first few lines in `getNames()` to take web pages from files (as in the first chunk of code in my answer.) I don't know what you are actually running so I don't know what's off. This produces the spreadsheet with posted image. – zdim Apr 24 '17 at 19:57
  • @Ultracrepidarian That means that `$table` is undefined, what means that in the previous line it got nothing by `$tree->look_down(...)`. Most likely something is different in the format of the HTML file from the one you showed here (and in your previous question). I have to go now ... I'll be able to look briefly only tonight, and then tomorrow. – zdim Apr 24 '17 at 20:08
  • So I couldn't figure out what was wrong with that part of it, so as a test, I ran it with the URL, as you did, and... It worked almost too well. It ended up printing those same 7 lines over five thousand times xD Any clue what went down? – Ultracrepidarian Apr 24 '17 at 20:41
  • @Ultracrepidarian I am running around here ... it would have to mean that the HTML pages you run it on are different than the ones in this question (and the previous one). So it's finding a lot more of those `MasterTable` or particular `td`'s or some such, and repeating. I don't see another reason. Look at the code and see what it does and adjust to your real pages... – zdim Apr 24 '17 at 23:35