4

Right I have built a web crawler. It scans links, titles and meta descriptions. It scans the links and saves them in $link. It scans the titles of the links and saves them in [title] in newArray. Now in this array, I want to it to know that if there are no meta-tags, it can scan for the first p tag and use that instead. The only problem is that it doesn't seem to be saving any info at all.

function getMetas($link) {
$str1 = file_get_contents($link);    

    if (strlen($str1)>0) {
 preg_match_all( '/<meta.*?name=("|\')description("|\').*?content=("|\')(.*?)("|\')/i', $str1, $description);
   if (count($description) > 1) {
    return $description[4];   
   }


   }
 return '';
   if ($description == '') {
$html = file_get_contents($link);    
preg_match('%(<p[^>]*>.*?</p>)%i', $html, $re);
$res = get_custom_excerpt($re[1]);
echo "\n";
echo $res;
echo "\n";

}

    function get_custom_excerpt($return, $option = 30, $sentance = false) {
    $marks = Array(".","!","?");

   $return = strip_tags($return);

  if($sentance == true) {
$start = implode(" ", array_slice(preg_split("/\s+/", $return), 0, $option ));
$start .= ' ';
$end = implode(" ", array_slice(preg_split("/\s+/", $return), $option));

$cut = Array();
foreach($marks AS $m => $mark){
  $mark = strpos($end, $mark);
  if($mark != false) $cut[$m] = $mark;
}

if($cut[0] != "")
  $chop = min($cut);
    else
      $chop = $option;
$rest = substr($end, 0, $chop);

$key = array_search($chop, $cut);

$return = $start.$rest;

   }else{
$return = implode(" ", array_slice(preg_split("/\s+/", $return), 0, $option));
   }
  $return .= $marks[$key];

  return $return; 
   }  

   }


 $output = Array();

 foreach ($links as $thisLink) {
 $output[] = array("link" => $thisLink, "title" => Titles($thisLink), "description" => getMetas($thisLink), getMetas($res));
 } 
  print_r($output);
Toon Krijthe
  • 52,876
  • 38
  • 145
  • 202
Noah Smith
  • 203
  • 4
  • 9
  • 3
    I would suggest using XPath to select elements and content instead of regular expressions. – str Sep 24 '12 at 07:35

1 Answers1

2

Your regex may not work. Attributes may not in your desired order. it could be <meta name="" content=""> or <meta content="" name="">.

why don't you use an XML parser? Most HTML is valid enough to be used for parsing.

Please have a look at PHP Parse HTML code

Community
  • 1
  • 1
Xandl
  • 122
  • 1
  • 7
  • This would be a elegant solution that would allow for the inconcistancy in coding styles between developers. For example the order of attributes within the meta tag, the XML parser won't really care. Nice one :) – Chris Sep 24 '12 at 08:30