I have been trying to get this script to work for literally days and I think that I'm almost there but it isn't quite working. My understanding of PHP is limited and I have hacked this script together from a lot of other scripts.
I have separated the script in to two parts. Each of these parts work when run on their own but when they are placed together the page never loads.
The first part checks a URL for any external links ignoring any nofollow links.
The second part checks the server response header for a url
The whole script should find any external, followed links on a web page and then check if any of links are broken.
Any help to get this working would be very much appreciated.
<?php
// This is the first part of the script to get a list of external links from a web page ignoring any nofollow links
// Set the parent URL
$url = 'http://www.example.com';
$pUrl = parse_url($url);
// Load the HTML into a DOMDocument
$doc = new DOMDocument;
@$doc->loadHTMLFile($url);
// Look for all the 'a' elements
$links = $doc->getElementsByTagName('a');
$numLinks = 0;
foreach ($links as $link) {
// Exclude if not a link or has 'nofollow'
preg_match_all('/\S+/', strtolower($link->getAttribute('rel')), $rel);
if (!$link->hasAttribute('href') || in_array('nofollow', $rel[0])) {
continue;
}
// Exclude if internal link
$href = $link->getAttribute('href');
if (substr($href, 0, 2) === '//') {
// Deal with protocol relative URLs as found on Wikipedia
$href = $pUrl['scheme'] . ':' . $href;
}
$pHref = @parse_url($href);
if (!$pHref || !isset($pHref['host']) ||
strtolower($pHref['host']) === strtolower($pUrl['host'])
) {
continue;
}
// Increment counter otherwise
echo $link->getAttribute('href') . " - ";
$numLinks++;
// This is the second part of the script to check to see if the link returns no response or a 404 response.
// Reset $checkurl
$checkurl = '';
// Set the URL to check server response code
$checkurl = $link->getAttribute('href');
// Check header response for URL
file_get_contents($checkurl);
$response = $http_response_header[0];
// If 404 exists in response then set as 404
if (strpos($response,'404') !== false) {
$server_response = '404';
}
// If there is no response then set as 404
if ($response == '') {
$server_response = '404';
}
echo $server_response;
echo '<br>';
}
?>