I'm trying to use horseman to login to a site, find a set of links, open each one, scrape some data and return it.
Here is what I have so far...
function getLinks() {
return horseman.evaluate(function () {
var links = [];
$('#cards > tbody > tr:not(:first-child) a').each(function (i, el) {
links.push($(el).attr('href'));
});
return links;
});
}
function scrapeDataFromLink(link) {
return horseman
.open(link)
.waitForSelector('#details > fieldset > table')
.evaluate(function () {
var data = {
name: $('#name > td:nth-child(2)').html().trim(),
type: $('#type > td:nth-child(2)').html().trim(),
expiry: $('#expiry > td:nth-child(2)').html().trim()
};
return data;
}
)
}
horseman
.userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0')
.open(LOGIN_URL)
.type('input[id=username]', username)
.type('input[id=password]', password)
.click('[id="login"]')
.waitForSelector('table[id=cards]')
.then(getLinks)
.then(function(links){
var promises = [];
links.forEach(function(link){
promises.push(scrapeDataFromLink(link));
});
Promise.all(promises).then((result) => {
console.log(result);
})
})
EDIT - I can now get back results using the code above but they are all the same result, using the last link in the chain. I think what is happening is the for each loop is opening each URL before the previous one has completed, so only results for the last opened link are returned. How can I make sure these promises are executed sequentially in sync?