In Perl, the quickest and best way I know to scan HTML is HTML::PullParser.
It is built on a robust HTML parser rather than a simple finite-state automaton (FSA) such as a Perl regex without recursion.
It works more like a SAX filter than a DOM.
use 5.010;
use constant NOT_FOUND => -1;
use strict;
use warnings;
use English qw<$OS_ERROR>;
use HTML::PullParser ();
# Build a pull parser over the HTML file that reports only the start tags of
# <a> elements.  With the 'tag, attr' argspec each token is an array ref of
# the form [ tag, attribute hash ref ].
my $html_file = 'my.html';    # your file name

my $pp = HTML::PullParser->new(
    file        => $html_file,
    # specifies that you want a tuple of tag name and attribute hash
    start       => 'tag, attr',
    # you only want to look at tags named 'a'
    report_tags => ['a'],
) or die "Cannot open '$html_file': $OS_ERROR";    # name the file so the failure is diagnosable
# Scan the tokens produced by $pp and remember the href of the first anchor
# whose URL contains 'schule.php?'.  $anchor_url stays undef when no anchor
# matches before get_token stops returning defined tokens.
my $anchor_url;
while ( defined( my $t = $pp->get_token ) ) {

    # Defensive guard (this shouldn't happen, really): skip anything that is
    # not an array-ref token for an 'a' tag.  Both conditions must hold, so
    # they are joined with 'and'; the token is only dereferenced once it is
    # known to be a reference.
    next unless ref $t and $t->[0] eq 'a';

    # Anchors without an href attribute (e.g. <a name="top">) have nothing
    # to match and would otherwise trigger an "uninitialized value" warning.
    my $href = $t->[1]->{href};
    next unless defined $href;

    # index() returns NOT_FOUND (-1) when the substring is absent.
    if ( index( $href, 'schule.php?' ) > NOT_FOUND ) {
        $anchor_url = $href;
        last;
    }
}