I am following a tutorial on writing a program that parses a web page and collects all of its links. The program works on pages served over plain HTTP, but whenever I run it against a site that uses a certificate (HTTPS), it throws the following error:
Fatal error: Uncaught ValueError: DOMDocument::loadHTML(): Argument #1 ($source) must not be empty in C:\xampp\htdocs\froogal\classes\DomDocumentParser.php:14
Stack trace:
#0 C:\xampp\htdocs\froogal\classes\DomDocumentParser.php(14): DOMDocument->loadHTML('')
#1 C:\xampp\htdocs\froogal\crawl.php(6): DomDocumentParser->__construct('http://www.appl...')
#2 C:\xampp\htdocs\froogal\crawl.php(18): followLinks('http://www.appl...')
#3 {main}
  thrown in C:\xampp\htdocs\froogal\classes\DomDocumentParser.php on line 14
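From the trace it looks like file_get_contents() is returning an empty string for the HTTPS page, so loadHTML() receives ''. If it helps, this is the kind of standalone check that could confirm whether the https:// stream wrapper and OpenSSL are even available (test code only, not part of the tutorial; the test URL is arbitrary):

<?php
// Sanity check, not from the tutorial: is HTTPS usable at all in this PHP install?
var_dump(in_array('https', stream_get_wrappers())); // expect bool(true)
var_dump(extension_loaded('openssl'));              // expect bool(true)

// Any HTTPS page will do here
$html = file_get_contents('https://www.apple.com');
var_dump($html === false, strlen((string) $html));
?>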
The code for the DomDocumentParser.php file is:
<?php
class DomDocumentParser {
    private $doc;

    public function __construct($url) {
        // Send a custom User-Agent with the GET request
        $options = array(
            'http' => array(
                'method' => "GET",
                'header' => "User-Agent: doodleBot/0.1\n"
            )
        );
        $context = stream_context_create($options);

        // Fetch the page and load it into the DOM
        $this->doc = new DomDocument();
        @$this->doc->loadHTML(file_get_contents($url, false, $context));
    }

    public function getLinks() {
        return $this->doc->getElementsByTagName("a");
    }
}
?>
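If it matters, a version of the constructor that checks the fetch result before calling loadHTML() and adds an ssl block to the stream context would look something like this (the verify_peer settings and the exception are guesses on my part, not from the tutorial):

<?php
class DomDocumentParser {
    private $doc;

    public function __construct($url) {
        $options = array(
            'http' => array(
                'method' => "GET",
                'header' => "User-Agent: doodleBot/0.1\r\n"
            ),
            // Assumption: relax certificate checks only to test whether SSL
            // verification is what makes the HTTPS fetch come back empty.
            'ssl' => array(
                'verify_peer' => false,
                'verify_peer_name' => false
            )
        );
        $context = stream_context_create($options);

        $html = file_get_contents($url, false, $context);
        if ($html === false || $html === '') {
            // Surface the failure instead of passing '' to loadHTML()
            throw new RuntimeException("Could not fetch: " . $url);
        }

        $this->doc = new DomDocument();
        @$this->doc->loadHTML($html);
    }

    public function getLinks() {
        return $this->doc->getElementsByTagName("a");
    }
}
?>

I'm not sure disabling peer verification is the right fix, so any guidance on the proper way to handle HTTPS here would be appreciated.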
And the code for crawl.php is:
<?php
include("classes/DomDocumentParser.php");

function followLinks($url) {
    $parser = new DomDocumentParser($url);
    $linkList = $parser->getLinks();

    foreach ($linkList as $link) {
        $href = $link->getAttribute("href");
        echo $href . "<br>";
    }
}

$startUrl = "http://www.apple.com";
followLinks($startUrl);
?>
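To reproduce the failure directly, the start URL just needs to point at an HTTPS page (the example URL is arbitrary):

$startUrl = "https://www.apple.com";
followLinks($startUrl);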