1

I'm trying to build a little script to scrape some data. I have some basic knowledge of JavaScript, but I'm kind of lost with all the async callback and promise stuff. Here is what I have now:

// Page to scrape. Declared explicitly: assigning to an undeclared name creates
// an implicit global and throws a ReferenceError in strict mode / ES modules.
const url = "http://Blablablabla.com";

/**
 * Route handler: downloads `url`, extracts the inner HTML of the first
 * element matching `.theitemIwant`, and replies with "<url>:<share>".
 *
 * @param {object} req - Incoming request (unused).
 * @param {object} res - Response object; assumed to be Express-style
 *   (`status()` / `send()`) — confirm against the framework actually in use.
 */
const shares = function (req, res) {
  request(url, function (error, response, body) {
    if (error) {
      console.log("We've encountered an error: " + error);
      // Always answer the client; without a response the HTTP request
      // would stay open until it times out.
      return res.status(500).send("Unable to fetch " + url);
    }

    const $ = cheerio.load(body);
    const share = $(".theitemIwant").html();

    return res.send(url + ":" + share);
  });
};

So everything is fine with this piece of code. What I would like to do is :

  1. Use an array of URLs: var urls = [url1,url2,url3,etc...]
  2. Store my scraped data in another array, something like this: data = [{url: url1, shares: share},{url: url2, shares: share},etc...]

I know I need to use something like this: data.push({ url: url, shares: share })

and I understand that I need to loop over my first url array to push data into my second data array.

However, I'm kind of lost with the request method and with how I should deal with the async issues in my situation.

thanks !

edit#1 :

I tried this to use promises :

// Attempt at a promise-style version. It does not work as written: the error
// reported just below this snippet is "geturl.then is not a function".
var url = "www.blablabla.com"
// `geturl` holds whatever request() returns when it is called with a callback.
// NOTE(review): presumably that is not a promise for the callback-style
// `request` package, which would explain the missing `.then` — confirm.
var geturl = request(url, function (error, response, body) {
  // `$` is assigned without a declaration, so it becomes an implicit global
  // that the code further down relies on.
  if (!error) { return $ = cheerio.load(body) } else 
  { console.log("We've encountered an error: " + error); }
});

// This is the line that throws, because `geturl` has no `then` method.
var shares = geturl.then( function() {
    return $(".nb-shares").html();
})

but I got the following error: geturl.then is not a function

tadman
  • 208,517
  • 23
  • 234
  • 262
Simon Breton
  • 2,638
  • 7
  • 50
  • 105

3 Answers3

3

I think you should use async:

var async = require('async');

const urls = ["http://example.com", "http://example.com", "http://example.com"];

// Collected results, one { url, share } entry per page scraped successfully.
// Entries are pushed in completion order, which may differ from `urls` order.
const data = [];

// One task per URL. async.parallel hands each task a completion callback
// `cb`; the task must call it exactly once when its request has finished.
const calls = urls.map((url) => (cb) => {
  request(url, (error, response, body) => {
    if (error) {
      console.error("We've encountered an error:", error);
      // Best effort: a failed page is logged and skipped so the other
      // pages are still collected.
      return cb();
    }

    const $ = cheerio.load(body);
    const share = $(".theitemIwant").html();
    data.push({ url, share });

    // Signal completion on success too; without this call the final
    // callback below would never run.
    return cb();
  });
});

// Runs once every task has called its callback; `data` is complete here.
async.parallel(calls, () => { /* YOUR CODE HERE */ });

You could do the same with promises, but I don't see why.

natanael
  • 230
  • 1
  • 7
2

I took a stab at it. You need to install the q library and require it:

var Q = require('q');

// ...wherever your handler function lives:
// Input: the page URLs (as strings) to scrape.
var urls = ["http://Blablablabla.com", "...", "..."];

// Output: one entry per URL, shaped as
//   {
//     url: <the URL>,
//     promise: <settled once that URL's request is done>,
//     share: <the extracted markup>
//   }
var results = [];

// Kick off one request per URL.
urls.forEach(function (url) {
  processUrl(url);
});

/**
 * Starts the request for a single URL and registers a tracking entry for it
 * in `results`.
 *
 * The entry is pushed immediately as { url, promise }; `share` is added to it
 * once the request succeeds. The promise is resolved with the extracted
 * markup on success and rejected with the request error on failure.
 *
 * @param {string} url - Page to fetch.
 */
function processUrl(url) {
  // A deferred gives us a promise we can settle from inside the callback.
  var pending = Q.defer();

  // Register the entry up front so `results` has one slot per URL.
  var entry = { url: url, promise: pending.promise };
  results.push(entry);

  request(url, function (error, response, body) {
    if (error) {
      // Reject so that anything waiting on this promise takes its
      // failure path.
      pending.reject(error);
      console.log("We've encountered an error: " + error);
      return;
    }

    var $ = cheerio.load(body);
    var share = $(".theitemIwant").html();

    // Fulfil the promise with the extracted value, then record the same
    // value on the entry itself.
    pending.resolve(share);
    entry.share = share;
  });
}

//results.map converts the array of result entries to just their promises
//Q.all takes in an array of promises
//when they are all done it will call your then/catch block.
//The map callback must *return* the promise: without the return, Q.all
//receives an array of undefined values and completes immediately, before
//any request has finished.
Q.all(results.map(function (result) { return result.promise; }))
    .then(sendResponse) //when all promises are done it calls this
    .catch(sendError);  //if any promise fails it calls this

 // Failure handler: answers with HTTP status 500 and a JSON body of the
 // form { failed: <error> }.
 function sendError(error) {
   var failure = res.status(500);
   failure.json({ failed: error });
 }
 // Success handler: sends the accumulated `results` array to the client.
 // `data` (the values the promises were resolved with) is not used here;
 // adapt this function if the response needs a different shape.
 function sendResponse(data) {
   var reply = res.send(results);
   return reply;
 }
Nix
  • 57,072
  • 29
  • 149
  • 198
  • Ok thanks a lot It's working fine. Why are you using Q specifically ? – Simon Breton Sep 22 '16 at 22:02
  • I'm not that familiar with native promises, so I generally use `q` as my go-to. It lets you create a promise, and it also gives you an easy way to check whether an array of promises is done. – Nix Sep 22 '16 at 22:05
  • 1
    Would it be too much to ask to put some commentary on each part of the code? I get the whole idea of promises, but I have a really hard time understanding how all this works. – Simon Breton Sep 22 '16 at 22:07
  • 2
    No sir, if I need comments, I didn't name stuff well enough ;) I'll update. – Nix Sep 22 '16 at 22:08
0

Here is another solution I like a lot :

const requestPromise = require('request-promise');
const Promise = require('bluebird');
const cheerio = require('cheerio');

const urls = ['http://google.be', 'http://biiinge.konbini.com/series/au-dela-des-murs-serie-herve-hadmar-marc-herpoux-critique/?src=konbini_home'];

// Fetch every page, then extract the share markup from each one and print
// an array of { <url>: <share> } objects.
// requestPromise is wrapped so it receives only the URL; when passed
// directly as the mapper it would also be handed the extra arguments
// (index, length) that Promise.map supplies.
Promise.map(urls, (url) => requestPromise(url))
  .map((htmlOnePage, index) => {
    const $ = cheerio.load(htmlOnePage);
    const share = $('.nb-shares').html();
    // `index` is used to look up the URL this page was fetched from.
    return { [urls[index]]: share };
  })
  .then(console.log)
  // Separator added so the message and the error text do not run together.
  .catch((e) => console.log(`We encountered an error: ${e}`));
Simon Breton
  • 2,638
  • 7
  • 50
  • 105