0

I'm trying to create a tool to scrape information off a web page (yes, I have permission).

So far, I have been using Node.js with requests and cheerio to pull the pages and then find the information based on CSS selectors. I have done enough debugging to know that the script is definitely getting the information from the pages successfully.

What appears to be happening is that the code after the for-loop is being performed first or maybe too quickly after the calls and the requests can't finish. I'm not entirely certain how the JS call stack works.

My source code appears as follows:

var baseURL = 'http://www2.dailyfaceoff.com/teams/lines/';
var request = require('request'), 
    cheerio = require('cheerio'),
    urls = [],
    teams = [];


var teamPages = [13, 14, 15, 16, 17, 18, 19, 20, 21,
 21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 
 35, 36, 37, 38, 39, 40, 41, 42]

 for(i in teamPages)
 {
    url = baseURL + teamPages[i];
    urls.push(url);
 }

 for(u in urls)
 {
    var team  = [];
    request(urls[u], function(err, resp, body)
      {
        if(!err && resp.statusCode == 200){
            var $ = cheerio.load(body);         
            var teamName = $('#newTitle').text();
            var players = [];
            $('#forwards td a img').each(function(){
                var name = $(this).attr("alt");
                players.push(name); });
            $('#defense td a img').each(function(){
                var name = $(this).attr("alt");
                players.push(name); });
            $('#goalie_list td a img').each(function(){
                var name = $(this).attr("alt");
                players.push(name); });
            //console.log(players);
            teams.push(players);    
        }
      });
 }
 console.log(teams);
 console.log('DONE');
Neurax
  • 3,657
  • 2
  • 13
  • 18
  • Do you have any idea what an asynchronous response is and what that means for how you write code that uses them? If not, you need to do some reading because your current approach cannot work. You may also want to read this: http://stackoverflow.com/questions/14220321/how-to-return-the-response-from-an-ajax-call – jfriend00 Nov 26 '14 at 05:35

1 Answers1

1

Something seems odd, Node.js is based on event-driven non-blocking model. So you need to be careful when using loop blocks like for with asynchronous calls. Try using forEach and give it a function handler. Also, print the result only when you are sure that all the requests have been fulfilled. The below code might help you, but its still not 100% correct/pretty:

urls.forEach( function (url, index) {
  var team  = [];
  request(u, function(err, resp, body)
  {
     if(!err && resp.statusCode == 200){
        .
        .
        teams.push(players);

        // Print the teams when last response is done
        if ( index == urls.length - 1 )
          console.log(teams);
     }
  });
}
Vageesh Bhasin
  • 553
  • 2
  • 12
  • Thank you, this is exactly what I thought the issue was.. I was initially looking at an asynchronous model using for each (u in urls) but I was running into similar issues. Thank you again. – Neurax Nov 26 '14 at 05:56
  • It takes time getting used to handling asynchronous code. Keep up the learning! And you are welcome, happy to help! :) – Vageesh Bhasin Nov 26 '14 at 05:57