The script below starts with some URLs in the `links` array. The function `gatherLinks()` gathers more URLs from the `sitemap.xml` of each URL in the `links` array. Once the `links` array holds enough URLs (as decided by the variable `limit`), the function `request()` is called for each URL in the `links` array to send a request to the server, fetch the response, and save an image of the page using the `page.render()` function.
The problem is that when I run it using PhantomJS 2.0.0, many of the images lack a lot of content, i.e. PhantomJS probably doesn't wait for all content to load. But when I use PhantomJS 1.9.8, all content is loaded just fine. What could be the reason?
var webpage = require('webpage');
var system = require('system');
var fs = require('fs');
// Seed URLs. gatherLinks() appends the <loc> entries it finds in each
// site's sitemap.xml to this same array until it holds `limit` entries.
var links = [
    "http://somesite.com",
    "http://someothersite.com"
    // ... add further seed URLs here (comma-separated)
];

// Position of the URL currently being processed in `links`; shared by the
// sitemap-gathering phase and the rendering phase (reset to 0 in between).
var index = 0;
// Number of page loads (sitemaps and rendered pages) that did not report "success".
var fail = 0;
// Upper bound on how many URLs gatherLinks() collects before rendering starts.
var limit = 20;
// Script start time in milliseconds; used for the final elapsed-time report.
// Declared with `var` so it is not created as an implicit global.
var finalTime = Date.now();
/**
 * Fetches `<link>/sitemap.xml`, appends every <loc> URL it contains to the
 * shared `links` array, and then moves on to the sitemap of the next entry
 * in `links`. Once `links` holds `limit` entries, or every entry has been
 * visited, `index` is reset to 0 and request() is started.
 *
 * A failed sitemap load is counted in `fail` and skipped, so one unreachable
 * site no longer stalls the whole run.
 *
 * @param {string} link - Base URL of the site whose sitemap should be read.
 */
var gatherLinks = function (link) {
    var page = webpage.create();
    var sitemapUrl = link + "/sitemap.xml";

    // Ends the gathering phase and starts rendering from the first link.
    var startRequests = function (message) {
        console.log(message);
        index = 0;
        request();
    };

    // Moves on to the next entry of `links`, or starts rendering when there
    // is none left. Checking `index + 1` avoids requesting
    // "undefined/sitemap.xml" after the last entry.
    var gatherNext = function () {
        if (index + 1 >= links.length) {
            startRequests(links.length + " Links prepared\n\n");
            return;
        }
        index++;
        gatherLinks(links[index]);
    };

    console.log("Fetching links from " + sitemapUrl);
    page.open(sitemapUrl, function (status) {
        if (status !== "success") {
            console.log("Sitemap Request FAILED, status: " + status);
            fail++;
            page.close();
            gatherNext();
            return;
        }

        // The sitemap is parsed into a standalone XML document, so the page
        // itself can be released as soon as its content has been read.
        var xmlDoc = new DOMParser().parseFromString(page.content, 'text/xml');
        var loc = xmlDoc.getElementsByTagName('loc');
        page.close();

        for (var i = 0; i < loc.length; i++) {
            if (links.length >= limit) {
                startRequests(links.length + " Links prepared. Starting requests.\n");
                return;
            }
            links.push(loc[i].textContent);
        }
        gatherNext();
    });
};
/**
 * Loads `links[index]` in a fresh page, renders it to "img_200_<index>.jpeg",
 * and then calls itself for the next link. After the last link it prints the
 * run summary and exits PhantomJS.
 *
 * Reads and updates the shared `index` and `fail` counters and reads
 * `links` and `finalTime`.
 */
var request = function () {
    // Local start time, so the timing does not rely on an implicit global.
    var startTime = Date.now();
    var page = webpage.create();

    page.open(links[index], function (status) {
        console.log('Loading link #' + (index + 1) + ': ' + links[index]);
        console.log("Time taken: " + (Date.now() - startTime) + " msecs");
        if (status !== "success") {
            console.log("Request FAILED, status: " + status);
            fail++;
        }

        // NOTE(review): the image is rendered as soon as open() reports its
        // status; content that the page loads afterwards may be missing from
        // the image. If that matters, delay this call — confirm against the
        // target sites.
        page.render("img_200_" + index + ".jpeg", {format: 'jpeg', quality: '100'});
        page.close();

        if (index >= links.length - 1) {
            console.log("\n\nAll links done, final time taken: " + (Date.now() - finalTime) + " msecs");
            console.log("Requests sent: " + links.length + ", Failures: " + fail);
            console.log("Success ratio: " + ((links.length - fail)/links.length)*100 + "%");
            phantom.exit();
            // Stop here: without this return the code below would close the
            // page a second time and request links[links.length] (undefined).
            return;
        }

        index++;
        request();
    });
};
// Entry point: start gathering sitemap links from the first URL in `links`.
// gatherLinks() calls request() itself once gathering is finished.
gatherLinks(links[0]);