0

I must have a memory leak, or something that is just eating memory on my server, somewhere in this class. For example, if I call file_get_contents('http://www.theknot.com'), it will not be able to connect to the server even though it's not down, or MySQL closes the connection, or in extreme situations it completely knocks out the server for an amount of time during which we cannot even get a ping. I know it's somewhere within the preg_match_all if block, but I don't know what would run away into what I can only assume is a lot of processing on the regex match, due to whatever is within the content that is fetched from the remote site. Any ideas?

<?php 

/**
 * Finds images on a remote page that are large enough to serve as link-preview thumbnails.
 *
 * NOTE(review): getPreviews() fetches whatever URL it is given; if $url can come
 * from end users, confirm upstream that it is restricted to public http(s) hosts.
 */
class Utils_Linkpreview extends Zend_Db_table
{
    /** Seconds allowed for each remote request (the page fetch and every image probe). */
    const FETCH_TIMEOUT = 5;

    /**
     * Returns the absolute URLs of the images referenced by a page that are at
     * least 190x190 pixels.
     *
     * Every candidate image is probed with getimagesize(), which is a remote
     * request per image. To keep one slow or image-heavy page from tying up the
     * PHP worker, each request is bounded by FETCH_TIMEOUT and the number of
     * probed images is capped by $maxCandidates.
     *
     * @param string $url           Absolute http(s) URL of the page to inspect.
     * @param int    $maxCandidates Maximum number of distinct images to probe;
     *                              0 or less disables the cap.
     * @return array|false List of absolute image URLs (possibly empty), or false
     *                     when the page could not be fetched or holds no matching
     *                     <img> tag.
     */
    public function getPreviews($url, $maxCandidates = 25) {
        $link = $url;
        $width = 200;
        $height = 200;
        // Stay inside a single <img ...> tag ([^>]*?) and match case-insensitively,
        // so "<IMG", ".JPG" and tags with a slash in an earlier attribute are found.
        $regex = '/<img\b[^>]*?\ssrc="([^"]+\.(?:jpe?g|gif|png))/i';
        $thumbs = false;
        $data = false;

        $context = stream_context_create(array(
            'http' => array('timeout' => self::FETCH_TIMEOUT),
        ));

        try {
            // file_get_contents() reports failure by returning false (plus a
            // warning); the catch only fires when an error handler converts
            // warnings into exceptions.
            $data = file_get_contents($link, false, $context);
        } catch (Exception $e) {
            print "Caught exception when attempting to find images: ". $e->getMessage(). "\n";
        }

        if (!$data || !preg_match_all($regex, $data, $m, PREG_PATTERN_ORDER)) {
            return $thumbs;
        }

        $thumbs = array();
        $candidates = array_unique($m[1]);
        if ($maxCandidates > 0) {
            $candidates = array_slice($candidates, 0, $maxCandidates);
        }

        // getimagesize() cannot take a stream context, so bound its remote reads
        // through default_socket_timeout and restore the previous value afterwards.
        $previousTimeout = ini_set('default_socket_timeout', (string) self::FETCH_TIMEOUT);

        foreach ($candidates as $src) {
            $absolute = $this->rel2abs($src, $link);
            if (!$absolute) {
                continue;
            }

            $size = @getimagesize($absolute);
            if ($size && $size[0] >= ($width - 10) && $size[1] >= ($height - 10)) {
                $thumbs[] = $absolute;
            }
        }

        if ($previousTimeout !== false) {
            ini_set('default_socket_timeout', $previousTimeout);
        }

        return $thumbs;
    }

    /**
     * Resolves an image reference found on a page against that page's URL.
     *
     * Handles absolute http(s) URLs, protocol-relative ("//host/x"),
     * root-relative ("/x") and document-relative ("x", "./x", "../x") references.
     *
     * @param string $url  The reference as written in the page.
     * @param string $host Absolute URL of the page ("scheme://host[/path]").
     * @return string|null Absolute URL, or null when the reference is empty,
     *                     uses a non-http(s) scheme, or $host is not absolute.
     */
    private function rel2abs($url, $host) {
        $url = trim($url);
        if ($url === '') {
            return null;
        }

        if (preg_match('#^https?://#i', $url)) {
            return $url;
        }

        // Any other explicit scheme (data:, javascript:, ftp:, ...) is not fetchable here.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
            return null;
        }

        // Ignore the page's query string/fragment; they may contain slashes.
        $base = preg_replace('/[?#].*$/s', '', $host);
        $hparts = explode('/', $base);
        if (count($hparts) < 3) {
            return null;
        }

        if (substr($url, 0, 2) === '//') {
            // Protocol-relative: reuse the page's scheme ("http:" / "https:").
            return $hparts[0] . $url;
        }

        $root = implode('/', array_slice($hparts, 0, 3));
        if ($url[0] === '/') {
            return $root . $url;
        }

        // Split the reference into its path and an optional "?query#fragment" tail.
        $pathLength = strcspn($url, '?#');
        $path = (string) substr($url, 0, $pathLength);
        $suffix = (string) substr($url, $pathLength);

        // Directory of the page: everything after the host minus the last
        // segment. When the page URL has no path this is empty, so the host
        // itself is never popped.
        $segments = array_slice($hparts, 3);
        array_pop($segments);

        foreach (explode('/', $path) as $segment) {
            if ($segment === '..') {
                array_pop($segments);
            } elseif ($segment !== '.') {
                $segments[] = $segment;
            }
        }

        return $root . '/' . implode('/', $segments) . $suffix;
    }
}
?>

EDIT - Amal Murali's comment pointed me in a better direction using PHP's DomDocument. Thanks bud!

Here is the result:

/**
 * Returns the absolute URLs of the <img> elements on a page whose declared
 * width or height attribute (a bare number or a value ending in "px") is
 * greater than 200.
 *
 * Only the HTML attributes are inspected; the images themselves are never
 * downloaded.
 *
 * @param string $url Absolute URL of the page to inspect.
 * @return array|false List of absolute image URLs, or false when the page
 *                     could not be fetched/parsed or no image qualifies.
 */
public function getPreviews($url) {
    $link = $url;
    $thumbs = false;
    $html = false;

    // Bound the fetch so an unresponsive remote host cannot hold the worker
    // for the full default socket timeout.
    $context = stream_context_create(array(
        'http' => array('timeout' => 5),
    ));

    try {
        // file_get_contents() signals failure by returning false; the catch
        // only fires when an error handler converts warnings into exceptions.
        $html = file_get_contents($link, false, $context);
    } catch (Exception $e) {
        print "Caught exception when attempting to find images: ". $e->getMessage(). "\n";
    }

    // loadHTML() fails on an empty source (and throws a ValueError on PHP 8),
    // so stop here when nothing was fetched.
    if (!is_string($html) || $html === '') {
        return $thumbs;
    }

    $dom = new DOMDocument();
    // "@" silences the libxml warnings that real-world, malformed HTML triggers.
    if (!@$dom->loadHTML($html)) {
        return $thumbs;
    }
    $x = new DOMXPath($dom);

    $found = array();
    foreach ($x->query("//img[@width > 200 or substring-before(@width, 'px') > 200 or @height > 200 or substring-before(@height, 'px') > 200]") as $node) {
        $src = trim($node->getAttribute("src"));
        if ($src === '') {
            continue;
        }

        // Skip references that cannot be resolved to an absolute URL.
        $absolute = $this->rel2abs($src, $link);
        if ($absolute) {
            $found[] = $absolute;
        }
    }

    // Collect into a real array instead of appending to false, and keep the
    // original contract of returning false when nothing was found.
    return $found ? $found : $thumbs;
}
LeviXC
  • 1,075
  • 2
  • 15
  • 32
  • **It** (my browser) will not be able to connect to the server (my server, running the above code) even though it's not down, or MySQL closes the connection, or in extreme situations it completely knocks out the server for an amount of time during which we cannot even get a ping. – LeviXC Aug 27 '13 at 17:13
  • 1
    Why aren't you using an [HTML parser](http://stackoverflow.com/questions/3577641/how-do-you-parse-and-process-html-xml-in-php)? – Amal Murali Aug 27 '13 at 17:15
  • I guess my age is showing (ha). Regex, roll your own for scraping, is all I have known. – LeviXC Aug 27 '13 at 17:21
  • Thanks man, DOMdoc did the trick! – LeviXC Aug 29 '13 at 02:08
  • You're welcome. I'd suggest posting the solution as an answer and marking it as *accepted solution*. – Amal Murali Aug 29 '13 at 11:55

1 Answer

0

EDIT - Amal Murali's comment pointed me in a better direction using PHP's DomDocument. Thanks bud!

Here is the result:

/**
 * Returns the absolute URLs of the <img> elements on a page whose declared
 * width or height attribute (a bare number or a value ending in "px") is
 * greater than 200.
 *
 * Only the HTML attributes are inspected; the images themselves are never
 * downloaded.
 *
 * @param string $url Absolute URL of the page to inspect.
 * @return array|false List of absolute image URLs, or false when the page
 *                     could not be fetched/parsed or no image qualifies.
 */
public function getPreviews($url) {
    $link = $url;
    $thumbs = false;
    $html = false;

    // Bound the fetch so an unresponsive remote host cannot hold the worker
    // for the full default socket timeout.
    $context = stream_context_create(array(
        'http' => array('timeout' => 5),
    ));

    try {
        // file_get_contents() signals failure by returning false; the catch
        // only fires when an error handler converts warnings into exceptions.
        $html = file_get_contents($link, false, $context);
    } catch (Exception $e) {
        print "Caught exception when attempting to find images: ". $e->getMessage(). "\n";
    }

    // loadHTML() fails on an empty source (and throws a ValueError on PHP 8),
    // so stop here when nothing was fetched.
    if (!is_string($html) || $html === '') {
        return $thumbs;
    }

    $dom = new DOMDocument();
    // "@" silences the libxml warnings that real-world, malformed HTML triggers.
    if (!@$dom->loadHTML($html)) {
        return $thumbs;
    }
    $x = new DOMXPath($dom);

    $found = array();
    foreach ($x->query("//img[@width > 200 or substring-before(@width, 'px') > 200 or @height > 200 or substring-before(@height, 'px') > 200]") as $node) {
        $src = trim($node->getAttribute("src"));
        if ($src === '') {
            continue;
        }

        // Skip references that cannot be resolved to an absolute URL.
        $absolute = $this->rel2abs($src, $link);
        if ($absolute) {
            $found[] = $absolute;
        }
    }

    // Collect into a real array instead of appending to false, and keep the
    // original contract of returning false when nothing was found.
    return $found ? $found : $thumbs;
}

LeviXC
  • 1,075
  • 2
  • 15
  • 32