I have a huge list of cURL handles (more than 1,000). The URL for each handle comes from a database, and the handles share most of their options but differ in a few, such as the User-Agent and Cookie. The code I currently use works, but it generates a lot of overhead and should be optimized with the curl_multi_* functions (curl_multi_setopt, curl_multi_init, curl_multi_close and so on). How should this be done?
// Sequential crawler: every URL from the database is requested once per
// client profile (user agent / cookie combination), one request at a time.
$url_result = $mysqli->query($check_url_query);
if ($url_result->num_rows > 0) {
    // Options that are identical for every request, declared once instead of
    // being repeated through a dozen curl_setopt() calls per handle.
    $commonCurlOptions = [
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_HEADER => true,
        CURLOPT_CUSTOMREQUEST => 'GET',
        CURLOPT_FOLLOWLOCATION => false,
        CURLOPT_ENCODING => 'gzip',
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 10,
        CURLOPT_SSL_VERIFYHOST => 0,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_NOBODY => false,
        CURLOPT_HTTPHEADER => ["Cache-Control: max-age=0,no-store,no-cache"],
        CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
    ];

    while ($crawlUrls = $url_result->fetch_array()) {
        $crawlerurl = $crawlUrls['url'];

        // Variant 1: desktop Chrome user agent, no cookie.
        $cch = curl_init($crawlerurl);
        curl_setopt_array($cch, $commonCurlOptions);
        curl_setopt($cch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.47 Safari/537.36");
        curl_exec($cch);
        // sleep() only accepts whole seconds, so sleep(0.300) never paused at
        // all; usleep() takes microseconds and gives the intended 300 ms.
        usleep(300000);
        curl_close($cch);
        // .....
        // Variant 2: desktop Safari user agent with the cache-vary cookies.
        $cch55 = curl_init($crawlerurl);
        curl_setopt_array($cch55, $commonCurlOptions);
        curl_setopt($cch55, CURLOPT_USERAGENT, "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15");
        curl_setopt($cch55, CURLOPT_COOKIE, "_lscache_vary_esi= ; _lscache_vary_dsg=");
        // The handle used to be configured and closed without ever being
        // executed, so this variant was never actually requested.
        curl_exec($cch55);
        usleep(300000);
        curl_close($cch55);
    }
}
[UPDATE]
Rebuilt the code for better control of timeouts and multi-threading:
/**
 * Fetches a list of URLs with curl_multi, running at most $nThreads
 * transfers at the same time.
 *
 * The URLs are split into batches of $nThreads; each batch is downloaded in
 * parallel and has to finish completely before the next batch starts.
 *
 * @param array $urlArray    URLs to fetch. The keys are preserved and reused
 *                           as the keys of the returned array.
 * @param array $optionArray cURL options (CURLOPT_* => value) applied to
 *                           every handle before its URL is set.
 * @param int   $nThreads    Maximum number of simultaneous transfers. Values
 *                           below 1 are treated as 1.
 *
 * @return array Result of curl_multi_getcontent() for every URL, keyed like
 *               $urlArray. Empty array when $urlArray is empty.
 */
function multi_thread_curl($urlArray, $optionArray, $nThreads) {
    // Always return an array, even when there is nothing to fetch.
    $results = [];
    // array_chunk() rejects a chunk size below 1.
    $batchSize = max(1, (int) $nThreads);

    foreach (array_chunk($urlArray, $batchSize, true) as $batch) {
        $mh = curl_multi_init();

        // Re-assigning $handles for every batch drops the references to the
        // previous batch's handles so they can be released right away.
        $handles = [];
        foreach ($batch as $key => $url) {
            $ch = curl_init();
            curl_setopt_array($ch, $optionArray); // Shared options.
            curl_setopt($ch, CURLOPT_URL, $url);  // Per-handle URL.
            curl_multi_add_handle($mh, $ch);
            $handles[$key] = $ch;
        }

        $active = null;
        do {
            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc === CURLM_CALL_MULTI_PERFORM);

            // Block until there is activity on a transfer. When select fails
            // (-1), back off briefly rather than spinning at full CPU load.
            if ($active && $mrc === CURLM_OK && curl_multi_select($mh, 1.0) === -1) {
                usleep(1000);
            }
        } while ($active && $mrc === CURLM_OK);

        foreach ($handles as $key => $ch) {
            $results[$key] = curl_multi_getcontent($ch);
            curl_multi_remove_handle($mh, $ch);
        }
        curl_multi_close($mh);
    }

    return $results;
}
// cURL options applied to every request issued by multi_thread_curl().
$optionArray = [
    CURLOPT_RETURNTRANSFER => 1,
    CURLOPT_HEADER => true,
    CURLOPT_CUSTOMREQUEST => 'GET',
    CURLOPT_FOLLOWLOCATION => false,
    CURLOPT_ENCODING => 'gzip',
    CURLOPT_CONNECTTIMEOUT => 10,
    CURLOPT_TIMEOUT => 10,
    CURLOPT_SSL_VERIFYHOST => 0,
    CURLOPT_SSL_VERIFYPEER => false,
    CURLOPT_NOBODY => false,
    CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.47 Safari/537.36",
    CURLOPT_HTTPHEADER => ['Cache-Control: max-age=0,no-store,no-cache'],
    CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
];

// Number of transfers that run at the same time.
$nThreads = 3;

$crawler_query = "SELECT url FROM litecache_url WHERE blacklisted != 1";
$crawler_query_result = $mysqli->query($crawler_query);

// Collect every URL first. Calling multi_thread_curl() once per row hands it
// a single URL each time, so nothing ever runs in parallel and $results only
// keeps the response of the last row.
$urlArray = [];
if ($crawler_query_result && $crawler_query_result->num_rows > 0) {
    while ($crawlUrls = $crawler_query_result->fetch_array()) {
        $urlArray[] = $crawlUrls['url'];
    }
}

// One call for the whole list: the function batches it by $nThreads itself.
$results = $urlArray ? multi_thread_curl($urlArray, $optionArray, $nThreads) : [];