I've written a scraper that iterates through every page on a website and extracts the information. There are a lot of pages; if this program were running non-stop it would take about a week to finish. However, every two or three hours it just hangs when it tries to extract the info from a page, and it never continues. This is frustrating because I keep having to restart the script manually. Here is the skeleton of it, run using NodeJS:
// Declare the cursor and bound explicitly — the original assignments created
// implicit globals, which fail outright in strict mode / ES modules.
let index = 0;
const finalIndex = 50000;

/**
 * Scrapes every remaining page, advancing the shared `index` cursor until it
 * reaches `finalIndex`.
 *
 * Rewritten as a loop: the original called itself recursively once per page,
 * and 50 000 synchronous stack frames overflow Node's call stack long before
 * the run completes. The loop produces the identical sequence of log lines
 * and leaves `index === finalIndex`.
 */
function scrape() {
  while (index < finalIndex) {
    // hit the website using nightmare, navigate to page, extract info, store as JSON
    console.log("finished scraping page number: ", index);
    index++;
  }
}

scrape();
I'd like to have a function, in this file or another, that runs the scrape function, and then every 2 hours kills the function and restarts it from the last index that it tried to scrape from. I've tried thinking of formulations using setTimeout, but I'm not sure how to kill a function stack half-way through. I also don't want the restarting function to fail if the scrape function has already started hanging.
What's the best way for me to do this? Other solutions to this problem are welcome, but even from a JavaScript knowledge standpoint I'd like to know how to do this for the future.
Here is my function in a bit more detail:
/**
 * Scrapes a single entry's detail page with a fresh Nightmare instance, then
 * schedules itself (via setTimeout) for the next entry. Advances the shared
 * `index` cursor; retries transient errors up to 3 times via `numOfTries`.
 *
 * Fixes over the original:
 *  - `setTimeout(scrape(), interval)` invoked scrape() IMMEDIATELY and passed
 *    its undefined return value to setTimeout, so the success path recursed
 *    with no delay at all. Pass the function reference instead.
 *  - The 'EmptyProperty' branch never called nightmare.end(), leaking one
 *    Electron child process per empty entry — a likely cause of the periodic
 *    hangs. It now tears the instance down like every other exit path.
 */
function scrape() {
  console.log("initializing scrape from index: " + index);
  var nightmare = Nightmare();
  if (index < indexEnd) {
    nightmare
      .goto(hidTestURL) //connect to the main site
      .wait('input[name="propertySearchOptions:advanced"]')
      .wait(4000)
      .goto(pageURL) //navigate to the specific entry's info page
      .wait('a[id="propertyHeading_searchResults"]')
      .wait(2500)
      .evaluate(function(){
        return document.querySelector('body').innerHTML;
      })
      .then(function(html){
        return xP([html, {data: css.data}])() //scrape the data from the page
      })
      .then(cleanDetails)
      .then(writeResult)
      .then(_ => {
        // Fully tear down the Electron child process so it cannot linger.
        nightmare.end();
        nightmare.proc.disconnect();
        nightmare.proc.kill();
        nightmare.ended = true;
        nightmare = null;
      })
      .then(function(){
        console.log("successful scrape for ", ids[index]);
        ++index;
        // FIX: pass the function reference; `setTimeout(scrape(), interval)`
        // called scrape() synchronously and scheduled `undefined`.
        setTimeout(scrape, interval); //start scraping the next entry after a specified delay (default 4 seconds)
      })
      .catch(function(e){
        if (e.message === 'EmptyProperty'){
          console.log('EmptyProperty');
          ++index;
          setTimeout (scrape, interval / 2);
          // FIX: this branch previously leaked the Nightmare/Electron
          // instance; end it like the other error paths do.
          return nightmare.end();
        }
        else {
          return appendFileP(logFile, new Date().toString() + " unhandled error at " + street + index + ' ' + e + '\r\n', 'utf8')
            .then(function(){
              if (numOfTries < 2){
                console.log("Looks like some other error, I'll retry: %j", e.message);
                ++numOfTries;
                setTimeout (scrape, interval * 5);
                return nightmare.end();
              }
              else {
                console.log("Tried 3 times, moving on");
                ++index;
                numOfTries = 0;
                setTimeout (scrape, interval * 5);
                return nightmare.end();
              }
            });
        }
      })
  }
}
There are helper functions whose code I haven't included, but their names should be obvious, and I don't think their function is an important part of the problem. I also want to make it clear that I'm running this using Node, it never runs in a browser.