I was recently building a scraper module to get some information with nodejs until I encountered this "little" problem. The modules that I'm using are cheeriojs and request. Actually the module works like a charm if I call only one method at a time. It contains three function and only two of them are exported, this is the code:
'use strict';
var request = require('request'),
cheerio = require('cheerio'),
counter = 0;
function find(term, cat, callback) {
// All the check for the parameters
scrape("http://.../search.php?search=" + encodeURIComponent(term), cat, callback);
}
function last(cat, callback) {
// All the check for the parameters
scrape("http://google.com/", cat, callback);
}
function scrape(url, cat, callback) {
request(url, function (error, response, body) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(body);
var result = [];
var items = $('.foo, .foo2').filter(function() {
// Condition to filter the resulted items
});
items.each(function(i, row) {
// Had to do another request inside here to scrape other information
request( $(".newpagelink").attr("href"), function(error, response, body) {
var name = $(".selector").text(),
surname = $(".selector2").text(),
link = cheerio.load(body)('.magnet').attr('href'); // This is the only thing that I'm scraping from the new page, the rest comes from the other "cheerio.load"
// Push an object in the array
result.push( { "name": name, "surname": surname, "link": link } );
// To check when the async requests are ended
counter++;
if(counter == items.length-1) {
callback(null, result);
}
});
});
}
});
}
exports.find = find;
exports.last = last;
The problem now, as I was saying, is that if I create a new node script "test.js" and I call only last OR find, it works perfectly! But if I call both the methods consecutively like this:
var mod = require("../index-tmp.js");
mod.find("bla", "blabla", function(err, data) {
if (err) throw err;
console.log(data.length + " find");
});
mod.last(function(err, data) {
console.log(data.length + " last");
});
The results are completely messed up, sometimes the script doesn't even print something, other times print the result of only "find" or "last", and other times returns a cheeriojs error (I won't add here to not mess you up, because probably it's my script's fault). I thought also to repeat the same function two times for both the methods but nothing, the same problems occur... I don't know what else to try, I hope you'll tell me the cause of this behavior!