I'm trying to create a tool to scrape information off a web page (yes, I have permission).
So far, I have been using Node.js with requests and cheerio to pull the pages and then find the information based on CSS selectors. I have done enough debugging to know that the script is definitely getting the information from the pages successfully.
What appears to be happening is that the code after the for-loop is being performed first or maybe too quickly after the calls and the requests can't finish. I'm not entirely certain how the JS call stack works.
My source code appears as follows:
var baseURL = 'http://www2.dailyfaceoff.com/teams/lines/';
var request = require('request'),
cheerio = require('cheerio'),
urls = [],
teams = [];
var teamPages = [13, 14, 15, 16, 17, 18, 19, 20, 21,
21, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42]
for(i in teamPages)
{
url = baseURL + teamPages[i];
urls.push(url);
}
for(u in urls)
{
var team = [];
request(urls[u], function(err, resp, body)
{
if(!err && resp.statusCode == 200){
var $ = cheerio.load(body);
var teamName = $('#newTitle').text();
var players = [];
$('#forwards td a img').each(function(){
var name = $(this).attr("alt");
players.push(name); });
$('#defense td a img').each(function(){
var name = $(this).attr("alt");
players.push(name); });
$('#goalie_list td a img').each(function(){
var name = $(this).attr("alt");
players.push(name); });
//console.log(players);
teams.push(players);
}
});
}
console.log(teams);
console.log('DONE');