
I have a huge list of curl handles (>1000). The URL for each handle comes from a database, and every handle has some shared options and some different ones, like the UA and cookie. My current code works, but it generates a lot of overhead; it should be optimized with curl_multi_init(), curl_multi_close() and the related curl_multi functions. How should this be done?

$url_result = $mysqli->query($check_url_query);
if ($url_result->num_rows > 0) {
    while ($crawlUrls = $url_result->fetch_array()) {
        $crawlerurl = $crawlUrls['url'];
        $cch = curl_init($crawlerurl);
        curl_setopt($cch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($cch, CURLOPT_HEADER, true);
        curl_setopt($cch, CURLOPT_CUSTOMREQUEST, 'GET');
        curl_setopt($cch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($cch, CURLOPT_ENCODING, 'gzip');
        curl_setopt($cch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($cch, CURLOPT_TIMEOUT, 10);
        curl_setopt($cch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($cch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($cch, CURLOPT_NOBODY, false);
        curl_setopt($cch, CURLOPT_HTTPHEADER, array("Cache-Control: max-age=0,no-store,no-cache"));
        curl_setopt($cch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($cch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.47 Safari/537.36");
        curl_exec($cch);
        usleep(300000); // sleep() only takes whole seconds; use usleep() for a 300 ms pause
        curl_close($cch);

        // .....

        $cch55 = curl_init($crawlerurl);
        curl_setopt($cch55, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($cch55, CURLOPT_HEADER, true);
        curl_setopt($cch55, CURLOPT_CUSTOMREQUEST, 'GET');
        curl_setopt($cch55, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($cch55, CURLOPT_ENCODING, 'gzip');
        curl_setopt($cch55, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($cch55, CURLOPT_TIMEOUT, 10);
        curl_setopt($cch55, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($cch55, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($cch55, CURLOPT_NOBODY, false);
        curl_setopt($cch55, CURLOPT_HTTPHEADER, array("Cache-Control: max-age=0,no-store,no-cache"));
        curl_setopt($cch55, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($cch55, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15");
        curl_setopt($cch55, CURLOPT_COOKIE, "_lscache_vary_esi= ; _lscache_vary_dsg=");
        curl_exec($cch55);
        usleep(300000);
        curl_close($cch55);
    }
}

[UPDATE]

Rebuilt the code for better control of timeouts and multi-threading:

    function multi_thread_curl($urlArray, $optionArray, $nThreads) {
        $results = array();
        // Split the URL list into batches of $nThreads, keeping the keys.
        $batches = array_chunk($urlArray, $nThreads, true);

        foreach ($batches as $batch) {
            $mh = curl_multi_init();
            $handles = array();

            foreach ($batch as $key => $url) {
                $handles[$key] = curl_init();
                curl_setopt_array($handles[$key], $optionArray); // Set your main curl options.
                curl_setopt($handles[$key], CURLOPT_URL, $url);  // Set the URL.
                curl_multi_add_handle($mh, $handles[$key]);
            }

            $active = null;

            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc == CURLM_CALL_MULTI_PERFORM);

            while ($active && $mrc == CURLM_OK) {
                // Wait for socket activity instead of busy-looping.
                if (curl_multi_select($mh) == -1) {
                    usleep(500);
                }
                do {
                    $mrc = curl_multi_exec($mh, $active);
                } while ($mrc == CURLM_CALL_MULTI_PERFORM);
            }

            foreach ($handles as $key => $handle) {
                $results[$key] = curl_multi_getcontent($handle);
                curl_multi_remove_handle($mh, $handle);
                curl_close($handle); // free each handle once its content is read
            }

            curl_multi_close($mh);
        }
        return $results;
    }

    $optionArray = array(
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER => true,
        CURLOPT_CUSTOMREQUEST => 'GET',
        CURLOPT_FOLLOWLOCATION => false,
        CURLOPT_ENCODING => 'gzip',
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 10,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_NOBODY => false,
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.47 Safari/537.36",
        CURLOPT_HTTPHEADER => array('Cache-Control: max-age=0,no-store,no-cache'),
        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1
    );

    $nThreads = 3;

    $crawler_query = "SELECT url FROM litecache_url WHERE blacklisted != 1";
    $crawler_query_result = $mysqli->query($crawler_query);

    if ($crawler_query_result->num_rows > 0) {
        // Collect all URLs first; calling multi_thread_curl() once per URL
        // would run the requests one at a time and defeat the parallelism.
        $urlArray = array();
        while ($crawlUrls = $crawler_query_result->fetch_array()) {
            $urlArray[] = $crawlUrls['url'];
        }
        $results = multi_thread_curl($urlArray, $optionArray, $nThreads);
    }
  • **[You should not switch off `CURLOPT_SSL_VERIFYHOST` or `CURLOPT_SSL_VERIFYPEER`](https://paragonie.com/blog/2017/10/certainty-automated-cacert-pem-management-for-php-software)**. It could be a security risk! [Here is how to get the certificate bundle if your server is missing one](https://stackoverflow.com/a/32095378/1839439) – Dharman Nov 20 '20 at 18:49
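
For reference, a minimal sketch of what the comment above suggests, keeping verification on and pointing curl at a CA bundle (the bundle path below is an example; adjust it to wherever your cacert.pem actually lives):

    // Assumes a CA bundle, e.g. downloaded from https://curl.se/ca/cacert.pem;
    // the path is illustrative.
    curl_setopt($cch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($cch, CURLOPT_SSL_VERIFYHOST, 2); // 2 = verify the host name in the certificate
    curl_setopt($cch, CURLOPT_CAINFO, '/path/to/cacert.pem');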

1 Answer


you're absolutely right that this is a job for curl_multi; it'd look something like this:

if ($url_result->num_rows > 0) {
    $mh = curl_multi_init();
    $workers = [];
    $max_workers = 100; // << AKA max simultaneous connections
    $results = [];
    $work = function () use (&$workers, &$mh, &$max_workers, &$results) {
        assert(count($workers) > 0, "must never be called with 0 workers...");
        $still_running = null;
        for (;;) {
            // todo: CURLM_OK check
            curl_multi_exec($mh, $still_running);
            if ($still_running < count($workers)) {
                break;
            }
            curl_multi_select($mh, 1);
        }
        while (false !== ($info = curl_multi_info_read($mh))) {
            if ($info['msg'] !== CURLMSG_DONE) {
                continue;
            }
            // note: 'result' is a per-transfer CURLE_* code, not a CURLM_* code
            if ($info['result'] !== CURLE_OK) {
                // proper per-transfer error handling omitted here
                throw new \RuntimeException("curl error");
            }
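            // the (int) cast works on PHP 7, where curl handles are resources;
            // on PHP 8+ they are CurlHandle objects, so use spl_object_id() instead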
            $key = (int) $info["handle"];
            $results[] = [
                curl_getinfo($info['handle'], CURLINFO_EFFECTIVE_URL),
                curl_multi_getcontent($info['handle'])
            ];
            curl_multi_remove_handle($mh, $info['handle']);
            curl_close($info['handle']);
            unset($workers[$key]);
        }
    };

    while ($crawlUrls = $url_result->fetch_array()) {
        while (count($workers) >= $max_workers) {
            $work();
        }
        $crawlerurl = $crawlUrls['url'];
        $cch = curl_init($crawlerurl);
        curl_setopt($cch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($cch, CURLOPT_HEADER, true);
        curl_setopt($cch, CURLOPT_CUSTOMREQUEST, 'GET');
        curl_setopt($cch, CURLOPT_FOLLOWLOCATION, false);
        curl_setopt($cch, CURLOPT_ENCODING, 'gzip');
        curl_setopt($cch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($cch, CURLOPT_TIMEOUT, 10);
        curl_setopt($cch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($cch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($cch, CURLOPT_NOBODY, false);
        curl_setopt($cch, CURLOPT_HTTPHEADER, array(
            "Cache-Control: max-age=0,no-store,no-cache"
        ));
        curl_setopt($cch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_1);
        curl_setopt($cch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.47 Safari/537.36");
        curl_multi_add_handle($mh, $cch);
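        // same note as above: on PHP 8+ use spl_object_id($cch) as the key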
        $workers[(int)$cch] = $cch;
    }
    while(count($workers) > 0){
        $work();
    }
    curl_multi_close($mh);
    var_dump($results);
}

this will download up to 100 pages simultaneously (configurable via $max_workers = 100;) and should be much faster than downloading pages one at a time

but if there are so many jobs, or the responses are so big, that you can't fit everything in memory, you'll have to replace

            $results[] = [
                curl_getinfo($info['handle'], CURLINFO_EFFECTIVE_URL),
                curl_multi_getcontent($info['handle'])
            ];

with something else, perhaps streaming the responses into a database
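
For example, a minimal sketch of streaming each finished response straight into MySQL instead of accumulating it in $results (the litecache_result table and its columns are hypothetical):

    // Prepare once, before the crawl loop (hypothetical table/columns):
    $stmt = $mysqli->prepare("INSERT INTO litecache_result (url, body) VALUES (?, ?)");

    // ...then, inside the curl_multi_info_read() loop, instead of $results[]:
    $url  = curl_getinfo($info['handle'], CURLINFO_EFFECTIVE_URL);
    $body = curl_multi_getcontent($info['handle']);
    $stmt->bind_param('ss', $url, $body);
    $stmt->execute();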

also, protip: you should not request one specific encoding with curl_setopt($cch, CURLOPT_ENCODING, 'gzip');. You may get problems if you ask for an encoding your libcurl wasn't built to support (e.g. a build without gzip), and curl may gain support for something even better than gzip in the future. Instead, set it to the empty string, curl_setopt($cch, CURLOPT_ENCODING, '');, which instructs libcurl to offer every encoding it supports, rather than only gzip (gzip and deflate are very commonly compiled in, though)

hanshenrik
  • @Dharman I respect your warning, but I think in this case the risk can be tolerated. The code I published is for internal usage only. It is meant for a crawler-like function where curl requests the host it is installed on, to warm up an HTTP cache. – Serpentdriver Nov 21 '20 at 05:50
  • @Serpentdriver if it's just to warm up a cache, you don't even need the results, right? in that case, you can replace `CURLOPT_RETURNTRANSFER` with a `CURLOPT_WRITEFUNCTION=>function($ch,string $data){return strlen($data);}` which throws away all the downloaded data with practically no ram usage whatsoever, then you won't have to worry about ram even if it's a million urls – hanshenrik Nov 21 '20 at 08:34
  • @hanshenrik I need the results for logging, but only specific headers like the status code and cache header. I still have problems using your code example, but will use it to control the multi-threading. – Serpentdriver Nov 21 '20 at 09:39
  • @Serpentdriver if you only need the headers, you can still replace CURLOPT_RETURNTRANSFER with `CURLOPT_WRITEFUNCTION`+`CURLOPT_HEADERFUNCTION`, which should still use way less RAM than CURLOPT_RETURNTRANSFER ^^ – hanshenrik Nov 21 '20 at 09:41
  • @Serpentdriver what problems did you get with the sample code tho? – hanshenrik Nov 21 '20 at 09:42
  • Complicated to explain what problems I still have, but I will try. The URLs come from a database, and every curl request has some shared but also some different options, like the UA and cookie value. My current code spells out each variation of the request with its options, and this is not really nice code. I want to group it to reduce the overhead. You know what I mean? – Serpentdriver Nov 21 '20 at 09:57
  • @Serpentdriver for that I would probably have a JSON column in the database with the appropriate curlopts for every url, something like ```while ($crawlUrls = $url_result->fetch_array()) { $url = $crawlUrls["url"]; $opts = json_decode($crawlUrls["curlopts"], true); curl_setopt_array($cch, $opts);``` – hanshenrik Nov 21 '20 at 10:15
  • @hanshenrik I have updated my code above and completely rebuilt it for better control of timeouts and multi-threading. It works better and faster, but I still have the problem of up to 60 different options for each handle and don't know how to modify the multi_thread_curl function to get what I need. Any ideas? – Serpentdriver Nov 23 '20 at 10:08
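
Putting the last few comments together, here is a rough sketch (not code from the thread) of a header-only handle: CURLOPT_WRITEFUNCTION throws the body away, and CURLOPT_HEADERFUNCTION keeps just the status line and cache headers for logging. The $headers array and the X-LiteSpeed-Cache header name are illustrative:

    // Discard the body; the callback must return the number of bytes it handled.
    curl_setopt($cch, CURLOPT_WRITEFUNCTION, function ($ch, $data) {
        return strlen($data);
    });

    // Keep only the interesting headers. In a curl_multi loop you would key
    // this array per handle, e.g. by (int)$ch (PHP 7) or spl_object_id($ch) (PHP 8+).
    $headers = [];
    curl_setopt($cch, CURLOPT_HEADERFUNCTION, function ($ch, $line) use (&$headers) {
        if (preg_match('/^(HTTP\/|Cache-Control:|X-LiteSpeed-Cache:)/i', $line)) {
            $headers[] = trim($line);
        }
        return strlen($line);
    });

    // The status code is also available after the transfer via
    // curl_getinfo($cch, CURLINFO_RESPONSE_CODE).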
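
And for the per-handle options, a sketch along the lines of the JSON suggestion above; the curlopts column and its contents are hypothetical:

    // Hypothetical `curlopts` column holding a JSON object of option => value,
    // e.g. {"10018": "Mozilla/5.0 ...", "10022": "_lscache_vary_esi="}
    // (10018 = CURLOPT_USERAGENT, 10022 = CURLOPT_COOKIE; JSON keys are strings).
    while ($crawlUrls = $url_result->fetch_array()) {
        $cch = curl_init($crawlUrls['url']);
        curl_setopt_array($cch, $optionArray); // shared options first
        $perUrl = json_decode($crawlUrls['curlopts'], true) ?: array();
        foreach ($perUrl as $opt => $value) {
            curl_setopt($cch, (int) $opt, $value); // per-URL overrides
        }
        curl_multi_add_handle($mh, $cch);
    }

This way each row carries only the options that differ, and the shared $optionArray stays in one place.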