0

I have a function that returns the links from a given page using a regular expression in PHP. Now I want to follow each of the links it finds, then the links found on those pages, and so on.

Here is the code I have

/**
 * Fetch a page and return the absolute URLs of the links it contains.
 *
 * Every <a ... href="..."> (single- or double-quoted) is collected:
 *  - absolute http/https links are kept unchanged;
 *  - protocol-relative links ("//host/x") get the scheme of $url;
 *  - links with any other scheme (javascript:, mailto:, ...), empty links
 *    and pure "#fragment" links are skipped;
 *  - everything else is treated as a path on the host of $url, falling back
 *    to $_SERVER['HTTP_HOST'] when $url has no host (e.g. a local file).
 *
 * Limitation: a document-relative link such as "page.html" is resolved
 * against the site root, not against the directory of $url.
 *
 * @param string $url Address (or local path) of the page to scan.
 * @return array Unique absolute URLs in document order; empty when the page
 *               cannot be read or contains no usable links.
 */
function getLinks($url)
{
    $content = file_get_contents($url);
    if ($content === false) {
        return array();
    }

    // Work out the scheme and host that relative links are resolved against.
    $parts = parse_url($url);
    if (!is_array($parts)) {
        $parts = array();
    }
    $scheme = 'http';
    $base = null;
    if (isset($parts['host'])) {
        if (isset($parts['scheme'])) {
            $scheme = strtolower($parts['scheme']);
        }
        $base = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $base .= ':' . $parts['port'];
        }
    } elseif (isset($_SERVER['HTTP_HOST']) && $_SERVER['HTTP_HOST'] !== '') {
        $base = 'http://' . $_SERVER['HTTP_HOST'];
    }

    // Capture the href value of every anchor; group 2 holds the raw value.
    $pattern = '/<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1/is';
    if (!preg_match_all($pattern, $content, $matches)) {
        return array();
    }

    // Keys act as a set so duplicates are dropped without a linear search.
    $unique = array();
    foreach ($matches[2] as $href) {
        // Attribute values are HTML-encoded ("&amp;" must become "&").
        $href = trim(html_entity_decode($href, ENT_QUOTES));
        if ($href === '' || $href[0] === '#') {
            continue;
        }

        if (preg_match('#^https?://#i', $href)) {
            // Already absolute: must not be prefixed with "/" or a host.
            $link = $href;
        } elseif (strncmp($href, '//', 2) === 0) {
            $link = $scheme . ':' . $href;
        } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            // javascript:, mailto:, tel:, ... are not crawlable pages.
            continue;
        } elseif ($base === null) {
            // No host is known, so a relative link cannot be made absolute.
            continue;
        } else {
            $link = $base . '/' . ltrim($href, '/');
        }

        $unique[$link] = true;
    }

    return array_keys($unique);
}
Christian Rau
  • 45,360
  • 10
  • 108
  • 185
Core
  • 601
  • 2
  • 7
  • 19
  • 1
    possible duplicate of [I need help making a website crawler using php](http://stackoverflow.com/questions/4736906/i-need-help-making-a-website-crawler-using-php) -or- http://stackoverflow.com/questions/2313107/how-do-i-make-a-simple-crawler-in-php – mario Nov 21 '11 at 07:38

1 Answer

1

Just do it recursively, and set a depth to terminate:

/**
 * Recursively crawl from $url and return every unique link discovered.
 *
 * Each fetched page is scanned for <a ... href="..."> values (single- or
 * double-quoted):
 *  - absolute http/https links are kept unchanged;
 *  - protocol-relative links ("//host/x") get the scheme of the page;
 *  - links with any other scheme (javascript:, mailto:, ...), empty links
 *    and pure "#fragment" links are skipped;
 *  - everything else is treated as a path on the host of the page, falling
 *    back to $_SERVER['HTTP_HOST'] when the page URL has no host.
 *
 * Limitation: a document-relative link such as "page.html" is resolved
 * against the site root, not against the directory of the page.
 *
 * @param string $url   Address (or local path) of the page to start from.
 * @param int    $depth Recursion budget. It is decremented before anything
 *                      is fetched, so 1 or less fetches nothing, 2 scans only
 *                      $url, 3 also scans the pages $url links to, and so on.
 * @param array  $seen  Internal accumulator shared between recursive calls
 *                      (link => remaining depth it was crawled with).
 *                      Callers should omit it.
 * @return array Unique absolute URLs in discovery order; empty when nothing
 *               was fetched or no usable links were found.
 */
function getLinks($url, $depth, &$seen = array())
{
    if (--$depth <= 0) {
        return array_keys($seen);
    }

    $content = file_get_contents($url);
    if ($content === false) {
        // An unreadable page ends this branch but must not abort the crawl.
        return array_keys($seen);
    }

    // Work out the scheme and host that relative links are resolved against.
    $parts = parse_url($url);
    if (!is_array($parts)) {
        $parts = array();
    }
    $scheme = 'http';
    $base = null;
    if (isset($parts['host'])) {
        if (isset($parts['scheme'])) {
            $scheme = strtolower($parts['scheme']);
        }
        $base = $scheme . '://' . $parts['host'];
        if (isset($parts['port'])) {
            $base .= ':' . $parts['port'];
        }
    } elseif (isset($_SERVER['HTTP_HOST']) && $_SERVER['HTTP_HOST'] !== '') {
        $base = 'http://' . $_SERVER['HTTP_HOST'];
    }

    // Capture the href value of every anchor; group 2 holds the raw value.
    $pattern = '/<a\b[^>]*?\shref\s*=\s*(["\'])(.*?)\1/is';
    if (!preg_match_all($pattern, $content, $matches)) {
        return array_keys($seen);
    }

    foreach ($matches[2] as $href) {
        // Attribute values are HTML-encoded ("&amp;" must become "&").
        $href = trim(html_entity_decode($href, ENT_QUOTES));
        if ($href === '' || $href[0] === '#') {
            continue;
        }

        if (preg_match('#^https?://#i', $href)) {
            // Already absolute: must not be prefixed with "/" or a host.
            $link = $href;
        } elseif (strncmp($href, '//', 2) === 0) {
            $link = $scheme . ':' . $href;
        } elseif (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href)) {
            // javascript:, mailto:, tel:, ... are not crawlable pages.
            continue;
        } elseif ($base === null) {
            // No host is known, so a relative link cannot be made absolute.
            continue;
        } else {
            $link = $base . '/' . ltrim($href, '/');
        }

        // Crawl a link the first time it is seen, and again only if it is
        // now reachable with more remaining depth than before; otherwise a
        // page first met deep in the tree would hide links it could reach.
        if (!isset($seen[$link]) || $seen[$link] < $depth) {
            $seen[$link] = $depth;
            getLinks($link, $depth, $seen);
        }
    }

    return array_keys($seen);
}

// Start the crawl at http://myurl.com with a depth argument of 3 and keep
// whatever getLinks() returns in $links.
$links = getLinks("http://myurl.com", 3);
Core
  • 601
  • 2
  • 7
  • 19
Authman Apatira
  • 3,994
  • 1
  • 26
  • 33