1

I'm trying to use horseman to log in to a site, find a set of links, open each one, scrape some data, and return it.

Here is what I have so far...

/**
 * Collects the href of every anchor found in the rows of the #cards table,
 * skipping the first row.
 *
 * The callback is handed to horseman.evaluate, so it is written in plain ES5.
 *
 * @returns whatever horseman.evaluate returns for the collected href array.
 */
function getLinks() {
    return horseman.evaluate(function () {
        var anchors = $('#cards > tbody > tr:not(:first-child) a');
        var hrefs = [];
        var index;

        for (index = 0; index < anchors.length; index++) {
            hrefs.push($(anchors[index]).attr('href'));
        }

        return hrefs;
    });
}

/**
 * Opens a link, waits for the details table to appear and reads the name,
 * type and expiry cells from the page.
 *
 * @param link URL passed to horseman.open.
 * @returns whatever horseman's evaluate returns for the
 *          { name, type, expiry } record.
 */
function scrapeDataFromLink(link) {
    var detailsPage = horseman
        .open(link)
        .waitForSelector('#details > fieldset > table');

    // The callback is handed to evaluate, so it is kept ES5-only.
    return detailsPage.evaluate(function () {
        var name = $('#name > td:nth-child(2)').html().trim();
        var type = $('#type > td:nth-child(2)').html().trim();
        var expiry = $('#expiry > td:nth-child(2)').html().trim();

        return { name: name, type: type, expiry: expiry };
    });
}


// Log in, wait for the cards table, collect its links and scrape each one.
horseman
    .userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0')
    .open(LOGIN_URL)
    .type('input[id=username]', username)
    .type('input[id=password]', password)
    .click('[id="login"]')
    .waitForSelector('table[id=cards]')
    .then(getLinks)
    .then(function (links) {
        var pendingScrapes = [];

        // NOTE: forEach calls scrapeDataFromLink for every link right away,
        // before any of the earlier calls has settled.
        links.forEach(function (link) {
            pendingScrapes.push(scrapeDataFromLink(link));
        });

        Promise.all(pendingScrapes).then(function (result) {
            console.log(result);
        });
    });

EDIT - I can now get results back using the code above, but they are all the same: every entry is the result for the last link in the chain. I think what is happening is that the forEach loop opens each URL before the previous one has completed, so only the results for the last opened link are returned. How can I make sure these promises are executed sequentially, one after another?

ljenkins
  • 133
  • 2
  • 10

2 Answers

2

I used the workOneByOne function outlined in an answer to this question to sequentially create and resolve promises. Not sure if this is the best way, but it works.

// Accumulates the scraped records: workOneByOne appends each intermediate
// result, and the final .then handler of the login chain appends the last one.
var scrapedData = []

/**
 * Gathers the href attribute of each anchor inside the #cards table rows
 * (every row except the first).
 *
 * The callback is passed to horseman.evaluate, so it sticks to ES5 syntax.
 *
 * @returns whatever horseman.evaluate returns for the gathered href array.
 */
function getLinks() {
    return horseman.evaluate(function () {
        var hrefs = [];
        var anchors = $('#cards > tbody > tr:not(:first-child) a');

        anchors.each(function () {
            // Inside .each, `this` is the current anchor element.
            hrefs.push($(this).attr('href'));
        });

        return hrefs;
    });
}

/**
 * Navigates to a link, waits until the details table is present, then
 * extracts the name, type and expiry cells.
 *
 * @param link URL passed to horseman.open.
 * @returns whatever horseman's evaluate returns for the
 *          { name, type, expiry } record.
 */
function scrapeDataFromLink(link) {
    return horseman
        .open(link)
        .waitForSelector('#details > fieldset > table')
        .evaluate(function () {
            // Passed to evaluate, so ES5 only.
            var record = {};

            record.name = $('#name > td:nth-child(2)').html().trim();
            record.type = $('#type > td:nth-child(2)').html().trim();
            record.expiry = $('#expiry > td:nth-child(2)').html().trim();

            return record;
        });
}


/**
 * Calls someAsyncFuntionReturningPromise for each item strictly in sequence:
 * the call for an item is made only after the promise for the previous item
 * has been fulfilled.
 *
 * Every truthy result except the last one is logged and appended to the
 * outer scrapedData array; the last result is not recorded here.
 *
 * @param items array of values to process in order.
 * @param someAsyncFuntionReturningPromise function invoked with one item; its
 *        return value becomes the next link of the chain.
 * @returns a Q promise for the result of the last item (for an empty array,
 *          the promise created by Q()).
 */
function workOneByOne(items, someAsyncFuntionReturningPromise) {
    // Start from Q() and hang one step per item off it.
    var chain = Q();

    items.forEach(function (item) {
        chain = Q.when(chain, function (previousResult) {
            if (previousResult) {
                console.log(previousResult);
                scrapedData.push(previousResult);
            }

            return someAsyncFuntionReturningPromise(item);
        });
    });

    return chain;
}


// Log in, collect the card links, scrape them one at a time and send the
// accumulated records back as JSON.
horseman
    .userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0')
    .open(LOGIN_URL)
    .type('input[id=username]', username)
    .type('input[id=password]', password)
    .click('[id="login"]')
    .waitForSelector('table[id=cards]')
    .then(getLinks)
    .then(function (links) {
        // Return the inner promise so the outer chain waits for the scrape
        // and any rejection from it reaches the .catch below.
        return workOneByOne(links, scrapeDataFromLink);
    })
    .then(function (result) {
        // workOneByOne records every result except the last one, which it
        // resolves with. With no links there is no last result, so skip the
        // push instead of emitting a null entry.
        if (result) {
            scrapedData.push(result);
        }
        res.setHeader('Content-Type', 'application/json');
        res.send(JSON.stringify(scrapedData));
    })
    .catch(function (err) {
        // Without a rejection handler a failed login or scrape would leave
        // the request unanswered.
        console.error(err);
        res.statusCode = 500;
        res.setHeader('Content-Type', 'application/json');
        res.send(JSON.stringify({ error: 'Scraping failed' }));
    });
Community
  • 1
  • 1
ljenkins
  • 133
  • 2
  • 10
0

You can consider the following approach for the step after the login:

// NOTE: `horseman` (the instance used for the login) and `cheerio` are not
// created in this snippet; both are expected to be in scope already.
var links = ["http://link1.com", "http://link2.com"];

// Records scraped so far, in the same order as `links`.
var scrapeData = [];

// Index of the link that doit() scrapes next.
var it = 0;

doit()
    .catch(function (err) {
        // Report the failure; scrapeData keeps whatever was collected so far.
        console.error(err);
    })
    .then(function () {
        // Close the browser exactly once, after the whole sequence has ended.
        return horseman.close();
    });

/**
 * Scrapes links[it], appends the record to scrapeData, advances `it` and
 * recurses for the next link. Each page is opened only after the previous
 * one has been processed, because the recursive call happens inside the
 * previous page's .then handler and its promise is returned.
 *
 * @returns a promise that settles once every remaining link has been scraped
 *          (fulfilled with scrapeData when nothing is left).
 */
function doit() {
    // Nothing left to scrape (this also covers an empty link list).
    if (it >= links.length) {
        return Promise.resolve(scrapeData);
    }

    return horseman
        .userAgent('Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0')
        .open(links[it])
        .waitForSelector('#details > fieldset > table')
        .html('body')
        .then(function (html) {
            var $ = cheerio.load(html);

            scrapeData.push({
                name: $('#name > td:nth-child(2)').html().trim(),
                type: $('#type > td:nth-child(2)').html().trim(),
                expiry: $('#expiry > td:nth-child(2)').html().trim()
            });

            it = it + 1;

            // Returning the recursive call chains the next page onto this one.
            return doit();
        });
}
sese smith
  • 356
  • 3
  • 5