0

I'm new at Node.js and promises (here, I'm using Q.js). I'm trying to make a scraper out of a site which has the following structure:

  • main_page: has a list of categories, each category has a link that points to a list of stores page.
  • list of stores page: has a list of stores, each store has a link that points to a store details page.
  • store detail page: has the data that I'm looking for.

I made a first working approach without promises, but as a result the code was very ugly. So I think this is a good case to use promises.

I cannot get this approach to work. When the second loop is done, the app doesn't continue (it never executes the end() method). Besides, I don't know how to attach the third loop.

How could I do it?

function get(url) {
    var deferred = Q.defer();
    requestify.get(url).then(function(response) {
        deferred.resolve(cheerio.load(response.getBody()));
    });
    return deferred.promise;
}

function process_main_page($) {
    var promises = [];
    $('.categories a').each(function(i) {
        var deferred = Q.defer();
        var storesList = $('.store');
        get($(this).attr('href')).then(function($) {
            deferred.resolve(process_stores_list(storesList));
        });
        promises.push(deferred);
    });
    return Q.all(promises);
}

function process_stores_list(storesList) {
    var promises = [];
    storesList.each(function() {

        // Here I need to make another ajax call for each store detail page, which has the data that I need.

        promises.push(deferred);
    });
    return Q.all(promises);
}

function end(res) {
    var deferred = Q.defer();
    fs.writeFile('output.json', JSON.stringify(myGatheredData, null, 4), function(err) {
        deferred.resolve(function() {
            res.send('File successfully written! - Check your project directory for the output.json file');
        });
    });
    return deferred.promise;
}

app.get('/', function(req, res) {
    get(url).then(process_main_page).then(end);
});
sgress454
  • 24,870
  • 4
  • 74
  • 92
Agorreca
  • 684
  • 16
  • 31

1 Answers1

2

As @BenjaminGruenbaum already commented, your code is littered with the deferred antipattern. The only (more or less) legitimate use of Q.defer() is for the fs.writeFile, but you forgot to handle errors there. It's easier to just promisify that API.

I cannot get this approach to work.

The overall structure seems to be fine. However, some points:

  • You never seem to be fetching the stores_list from the page with the stores. You fetch that page, but resolve the promise with var storesList = $('.store'); from the category page?
  • your end method does get the myGatheredData - the array of results joined by Q.all - passed as its argument. It does not have any access to the response object.

When the second loop is done, the app doesn't continue (it never executes the end() method). Besides, I don't know how to attach the third loop.

I think that's the reason - you probably were already constructing the deferreds for the Q.all() array, but never resolved them. That made the returned promise "hang" (stay pending), and the end callback was never called.

var write = Q.nbind(fs.writeFile, fs);
function get(url) {
    return requestify.get(url).then(function(response) {
        return cheerio.load(response.getBody()));
    });
}

function process_main_page($_main) {
    var promises = $_main('.categories a').map(function(i) {
        // var storesList = $_main('.store'); // not sure what this did
        return get($_main(this).attr('href')).then(process_storelist_page);
    }).toArray();
    return Q.all(promises);
}
function process_storelist_page($_stores) {
    return process_stores_list($_stores('a.store').map(function() {
        return $_stores(this).attr('href'); // whatever?
    }).toArray());
}

function process_stores_list(storesList) {
    var promises = $.map(storesList, function(store_url) {
        // Here make another ajax call for each store detail page
        return get(store_url).then(process_store_page););
    });
    return Q.all(promises);
}
function process_store_page($_store) { // which has the data that I need.
    return /* select some data from the page */;
}
function save_data(myGatheredData) {
    return write('output.json', JSON.stringify(myGatheredData, null, 4)).then(function() {
        return 'File successfully written! - Check your project directory for the output.json file';
        });
    });
}

app.get('/', function(req, res) {
    get(url).then(process_main_page).then(save_data).then(function end(result) {
        res.send(result);
    });
});

Of course you could also just nest everything with function expressions instead of the function declarations I have used.

Community
  • 1
  • 1
Bergi
  • 630,263
  • 148
  • 957
  • 1,375