2

I'm new to Node.js. I'm using zombie.js to scrape a web page title from a few websites. Below is my code:

var Browser = require("zombie");
var util = require("util");
halt = require('delayed');
title = [];
url = [ 'http://www.apple.com', 'http://www.microsoft.com', 'http://www.dell.com' ];


function getTitles(url){
    //console.log('Start scraping title');
    var length = url.length;
    console.log('Total Site to Scrape: '+length);
    label = 1;
    for(var i=0;i<length;i++){
        browser = new Browser()
        browser.runScripts = false
        browser.setMaxListeners(0);
        browser.visit(url[i], function(e, browser, status, errors) {
        browser.wait(function(){
            title[i] = browser.text('html > head > title');
            console.log(label+': '+title[i]);
            browser.close();
            label++;
        });
        });
    };  
}


getTitles(url);

halt.delay(function () {
    console.log('Array Length: '+title.length)
    console.log('Array Content: '+title)
}, 10)

Below is the output of the code:

Total Site to Scrape: 3
1: Apple
2: Dell Official Site - The Power To Do More | Dell
3: Microsoft Home Page | Devices and Services
Array Length: 4
Array Content: ,,,Microsoft Home Page | Devices and Services

The part that I don't understand:

  1. Why is the array length returned 4 instead of 3? There are only three urls
  2. Why is the array content only return the last element? Where are the other two missing elements?
Caracos
  • 31
  • 5

1 Answers1

1

I'm not familiar with zombie but I am pretty sure this is a closure problem. i isn't what you think it is when you think it is. See this answer: JavaScript closure inside loops – simple practical example for some information on closures. Basically what's happening is your loop continues even though your requests are async, they're not done so when they come back, you've looped through all 3 URLS, now you've got three elements... well then it inserts all 3 values into the 4th element. Last one wins, so Microsoft is all you see.

Community
  • 1
  • 1
Jonathan Rowny
  • 7,588
  • 1
  • 18
  • 26
  • Sorry for being a bit thick. Based on the top vote answer for the link that you gave me, the content of the funcs array is still not the actual value but instead: Array Content: function () { console.log("My value: " + i); },function () { console.log("My value: " + i); },function () { console.log("My value: " + i); } – Caracos Nov 02 '12 at 23:41