0

I've got a double foreach loop. The script takes URLs from one file and tries to find them in the HTML code of the pages listed in another file. Of course, reading so many pages is pretty hard on the server, so I want to optimize the script — but how can I do it?

Here is the code:

<?php
// Load the list of pages to scan and the list of URLs to look for,
// trimming whitespace (including the trailing newline) from every line.
$sites = array_map('trim', file('https://earnmoneysafe.com/script/sites.txt'));
$urls  = array_map('trim', file('https://earnmoneysafe.com/script/4toiskatj.txt'));

/**
 * Fetch a URL with cURL and return the response body, or FALSE on failure.
 *
 * @param string $url Page to download.
 * @return string|false Response body, or FALSE if the transfer failed.
 */
function file_get_contents_curl($url) {
    $ch = curl_init();
    // Some sites answer 403 to the default cURL agent, so present a browser UA.
    $config['useragent'] = 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0';

    // BUG FIX: the original passed undefined $curl here instead of $ch, so the
    // user agent was never applied (fatal TypeError on PHP 8).
    curl_setopt($ch, CURLOPT_USERAGENT, $config['useragent']);
    curl_setopt($ch, CURLOPT_AUTOREFERER, TRUE);
    curl_setopt($ch, CURLOPT_HEADER, 0);            // body only, no response headers
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);    // return the body instead of echoing it
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE); // follow HTTP redirects

    $data = curl_exec($ch);
    curl_close($ch);

    return $data;
}

// Download each site's homepage once, then report every needle URL found in it.
foreach ($sites as $site) {
    $html = file_get_contents_curl($site);
    foreach ($urls as $needle) {
        if (strpos($html, $needle) !== false) {
            echo 'true';
        }
    }
}
?>

  • 1
    You could use `curl_multi_exec()` to fetch all the URLs in parallel. – Barmar Feb 06 '23 at 17:11
  • 1
    FYI, if you're using `trim()` to remove the newlines, you can do that automatically with the `FILE_IGNORE_NEW_LINES` flag to the `file()` function. – Barmar Feb 06 '23 at 17:12
  • @Barmar I'm a newbie with cURL. I tried to do it with cURL but got a 403 from every page – Stack Stack Feb 06 '23 at 17:13
  • I can't see why the same request would return an error from `curl` but would work with `file_get_contents()`. Did you use the same user agent? Post your curl attempt. – Barmar Feb 06 '23 at 17:15
  • @Barmar posted cUrl attempt – Stack Stack Feb 06 '23 at 17:21
  • You probably need to set the user agent: https://stackoverflow.com/questions/17801094/php-curl-how-to-add-the-user-agent-value-or-overcome-the-servers-blocking-curl-r – Barmar Feb 06 '23 at 17:24
  • @Barmar yeah, the problem was with the user agent, but some URLs are still not available, such as https://www.si.com/showcase/nutrition/best-weight-loss-programs – Stack Stack Feb 06 '23 at 17:33
  • 1
    Different sites use different techniques to prevent web scraping. They might be using a cookie. – Barmar Feb 06 '23 at 17:35
  • @Barmar OK. So I edited the code (change `file_get_contents()` to cUrl). But script still is too slow and gives `Request Timeout` error. How can I use `curl_multi_exec()`? Can you write an answer please? – Stack Stack Feb 06 '23 at 17:53

2 Answers2

1

Use curl_multi_exec() to fetch all the URLs in parallel.

// Load the needle URLs and the sites to scan, one per line, newlines stripped.
$urls = file('https://earnmoneysafe.com/script/4toiskatj.txt', FILE_IGNORE_NEW_LINES);
$sites = file('https://earnmoneysafe.com/script/sites.txt', FILE_IGNORE_NEW_LINES);

// One configured cURL handle per site, keyed by the site URL.
$curl_handles = [];
foreach ($sites as $site) {
    $curl_handles[$site] = get_curl($site);
}

// Add every handle to a multi handle so the downloads run in parallel.
$mh = curl_multi_init();
foreach ($curl_handles as $ch) {
    curl_multi_add_handle($mh, $ch);
}

// Drive all transfers to completion.
// BUG FIX: looping only while curl_multi_exec() returns
// CURLM_CALL_MULTI_PERFORM is wrong on libcurl >= 7.20 (that status is never
// returned there), so the old loop exited before the transfers finished and
// curl_multi_getcontent() saw truncated/empty bodies. Loop while transfers
// are still active instead, and block in curl_multi_select() between
// iterations rather than busy-waiting.
do {
    $mrc = curl_multi_exec($mh, $active);
    if ($active) {
        curl_multi_select($mh);
    }
} while ($active && $mrc == CURLM_OK);

// All downloads are done; scan each page for every needle URL.
foreach ($curl_handles as $site => $ch) {
    $homepage = curl_multi_getcontent($ch);
    foreach ($urls as $needle) {
        if (strpos($homepage, $needle) !== false) {
            echo 'true';
        }
    }
    curl_multi_remove_handle($mh, $ch);
}

curl_multi_close($mh);
    
/**
 * Build a configured (but not yet executed) cURL handle for $url.
 * The caller is expected to run it via the curl_multi_* API.
 *
 * @param string $url Page to download.
 * @return resource|\CurlHandle Ready-to-run cURL handle.
 */
function get_curl($url) {
    // Some sites answer 403 to the default cURL agent, so present a browser UA.
    $browserAgent = 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:17.0) Gecko/20100101 Firefox/17.0';

    $handle = curl_init();
    curl_setopt_array($handle, [
        CURLOPT_USERAGENT      => $browserAgent,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => true,
    ]);

    return $handle;
}
Barmar
  • 741,623
  • 53
  • 500
  • 612
0

I think this code is cleaner:

<?php

const SITES_URL = 'https://earnmoneysafe.com/script/sites.txt';
const URLS_URL = 'https://earnmoneysafe.com/script/4toiskatj.txt';

/**
 * Download a text file and return its non-empty lines, trimmed.
 *
 * BUG FIX: the original split on "\n" only, so files with CRLF line endings
 * left a trailing "\r" on every line, which made the later strpos() matching
 * silently fail; it also returned the gappy keys array_filter() preserves.
 * Each line is now trimmed and the result reindexed as a clean list.
 *
 * @param string $url Location of the text file (URL or local path).
 * @return string[] Trimmed, non-empty lines in file order.
 */
function readFileLines($url) {
    $file_contents = file_get_contents($url);
    // Cast guards against FALSE from a failed fetch (yields an empty list).
    $lines = array_map('trim', explode("\n", (string) $file_contents));

    // Drop blank lines and reindex so the caller gets a sequential list.
    return array_values(array_filter($lines, function ($line) {
        return $line !== '';
    }));
}

/**
 * Fetch the page at $site and echo 'true' once for every needle URL found in it.
 *
 * @param string   $site Page to download (URL or local path).
 * @param string[] $urls Needle strings to search for in the page source.
 * @return void
 */
function checkSiteUrls($site, $urls) {
    $html = file_get_contents($site);

    foreach ($urls as $needle) {
        $position = strpos($html, $needle);
        if ($position !== false) {
            echo 'true';
        }
    }
}

// Fetch the site list first, then the needle URLs, and scan every site.
$siteList = readFileLines(SITES_URL);
$needleUrls = readFileLines(URLS_URL);

foreach ($siteList as $siteUrl) {
    checkSiteUrls($siteUrl, $needleUrls);
}

?>