Have a few existing sites that I need to input all of the URLS and content into Elasticsearch, so using some examples, I built a way to grab all of the internal links on the site and put them in an array. The problem is every example I've seen is not fully recursive, there's a depth setting. I've been trying for the past 6 hours (seriously) of different ways to make it fully recursive. Here's how I am doing it now, but I think I'm infinite looping and crashing because after a minute of running I get no errors, just a "No Data Received" page. I'm open to any suggestions on a better approach.
<?php
set_time_limit (1209600);
ini_set('memory_limit', '-1');
$seen = array();
$urls = crawl_page("http://example.com", $seen);
foreach($urls as $url){
echo $url.'<br />';
}
function crawl_page($url, $seen){
//CURL TO GRAB PAGE
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_TIMEOUT, 60);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
$result = curl_exec ($ch);
curl_close ($ch);
//URL NO HTTP/S OR WWW
$urlStripped = str_replace('www', '', $url);
$urlStripped = str_replace('http://', '', $urlStripped);
$urlStripped = str_replace('https://', '', $urlStripped);
//ADD THIS URL TO THE ARRAY
if(!in_array($url, $seen)){
$seen[] = $url;
}
//GET ALL LINKS IN PAGE
$stripped_file = strip_tags($result, "<a>");
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $stripped_file, $matches, PREG_SET_ORDER );
foreach($matches as $match){
$href = $match[1];
//MAKE SURE LINK ISNT DUPLICATE AND MAKE SURE THE LINK IS INTERNAL
if(!in_array($href, $seen) && is_in_string($urlStripped, $href)){
$seen[] = $href;
}
}
//HERE'S WHERE THE PROBLEM LIES, ATTEMPTING TO MAKE THIS RECURSIVE.
//I'VE DONE THIS MANY DIFFERENT WAYS WITH NO LUCK.
//I DON'T REALLY HAVE A REASON FOR IT'S CURRENT STATE.
//I ENDED UP TAKING SHOTS IN THE DARK AND THATS WHAT LED ME TO ASK ON STACKOVERFLOW
$seenTemp1 = $seen;
foreach($seenTemp1 as $aUrl){
$seenTemp2 = crawl_page($aUrl, $seenTemp1);
$seen = array_merge($seen, $seenTemp2);
}
//RETRUN ARRAY
return $seen;
}
function is_in_string($needle, $string){
$before = strlen($string);
$after = strlen(str_replace($needle, '', $string));
if($before != $after){
return true;
}
else{
return false;
}
}
?>