1

I am having some trouble with my crawler in Node.js: I get errors when I launch it against the Vinted website, and they occur very often. First I get this:

Error: getaddrinfo ENOTFOUND www.vinted.fr www.vinted.fr:443

Then

... Error: read ECONNRESET Error: read ECONNRESET Error: read ECONNRESET Error: read ECONNRESET Error: read ECONNRESET ...

and sometimes

Error: socket hang up
 Error: socket hang up
 Error: socket hang up

But my crawler works and returns the correct results for some products, then stops after about 10 minutes. I think it is because I send too many requests, but I need to send them... So it is probably a network issue.

I am completely stuck with all these errors. Is it possible to fix them?

Thanks for your help.

Here is my code:

// Reads link.json (next to this script), builds a product URL from each entry
// of its `link` array, requests the page, scrapes a handful of fields with
// cheerio and inserts them into the `vinted` table.
// NOTE: this snippet is cut off after the request-error branch; the braces
// closing the request callback, the loop and the readFile callback are not shown.
fs.readFile(__dirname +'/link.json', 'utf8', function (err, data) {
        var obj;
        if (err) throw err;
        obj = JSON.parse(data);
        urlp = obj.link;
        console.log(colors.yellow("Products:"+urlp.length));
          // request() is called once per iteration without waiting for the
          // previous response, so the loop fires every request back to back.
          // The loop reads urlp[0] .. urlp[urlp.length-3]; `i`, `url` and `urlp`
          // are assigned without a declaration.
          for(i = 1; i < urlp.length-1; i++){

            url = 'https://www.vinted.fr'+urlp[i-1];
            request(url, function(error, response, html){

            if(!error){
                  var $ = cheerio.load(html);
                  var link = [];
                  var json = { link : ""};
                  var price = $('span[itemprop=price]').text();
                  var format_price = price.replace(/\n|\r/g,""); 
                  var format_price2 = format_price.replace(/ /g,"");
                  var res1 = $('.details-list--details');
                  var meta = $("link[rel='canonical']").attr('href');
                  var images = []; // image array


                  // Collect the data-src attribute of every product image.
                  $('img[itemprop=image]').filter(function(){
                      var img = $(this).attr('data-src');;
                      images.push(img);
                  })
                //  var imageshow = console.log(colors.rainbow(images .join(", ")));

                  var brand = $('.inverse > [itemprop=name]').text();
                  var state = $('div[itemprop=itemCondition]').text(); 
                  var color =  $('div[itemprop=color]').text(); 
                  // NOTE(review): `i` is shared with the loop above, so this
                  // presumably logs the loop's final index for every response
                  // rather than the entry that was requested -- confirm.
                  console.log(urlp[i]);
                  // A non-empty .state-bar text is treated as "item sold" (vendu = 1).
                  var token_vendu = $('.state-bar').text();
                  if(token_vendu != ""){ 
                    console.log(colors.red('PRODUIT VENDU'));
                    var vendu = 1; 
                  }else{
                    vendu = 0;
                  }
                  console.log(colors.blue("CallBack Vente "+vendu));

                  var discount_price = $('.old-price').text(); 
                  console.log("Discount: " + discount_price);
                // The size is cut out of the details-list text by splitting on
                // runs of whitespace; which token is used depends on whether a
                // brand was found.
                try{
                  if(brand == ""){ 

                    var size = res1.children().parent().text();
                    var format_size = size.replace(/ /g,"");
                    var format_size2 = format_size.replace(/[\n]/gi, " " );
                    var split_size1 = format_size2.split("    ");
                    var split_size2 = split_size1[0].split(" ");
                    var split4 = split_size2[4];
                    var formatsize = split4;

                  }else{

                    var size = res1.children().parent().children().text(); 
                    var format_size = size.replace(/ /g,"");
                    var format_size2 = format_size.replace(/[\n]/gi, " " );
                    var split_size = format_size2.split("         ");
                    console.log("split: "+split_size[1] )
                    var split3 = split_size[1].split(" ");
                    formatsize = split3[1];


                  }

                } catch (e) {
                    // This writes to split_size, not formatsize, so formatsize
                    // keeps whatever value it had when the exception was thrown.
                    split_size[1] = "N/A";
                    console.log(e.message);
                    }
                  console.log("Size : " + formatsize);
                  console.log("Brand : "+brand);
                  console.log(meta);
                  console.log("color : " + color);
                  console.log("state : " + state);

                  //Save to database
                 connection.query('INSERT INTO `vinted` VALUES ( NULL , ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP())',
                      [brand,
                      color,
                      format_price2,
                      discount_price,
                      state,
                      formatsize,
                      vendu,
                      images.join(", "),
                      meta
                      ]
                  , function (err, result) {
                      if (err) {console.error('error inserting into database : ' + err.stack); return;}
                  });
                }else{console.log(colors.red(error));} // here the error ECONNRESET/ENOTFOUND ...
Split Evo
  • 15
  • 7
  • 1
    It might help if you add some code sample. Read more about the causes of ECONNRESET here: http://stackoverflow.com/questions/17245881/node-js-econnreset?rq=1. – kennasoft Aug 25 '16 at 12:08
  • Thanks for your answer i already saw this post but there is no solution for me I add more code in my last post – Split Evo Aug 25 '16 at 12:16
  • 1
    Oh man! now I see why there'd be such a response. You just iterating through the list of urls and sending out requests at the speed of the computer. Better you do it one after the other, as suggested in my answer. OR when I have the time, I could modify this your code for you. – kennasoft Aug 25 '16 at 12:41
  • I dont see how can i do that .. if you have an examples. How can i execute my request one after one ? I can use the @kennasoft methods ? – Split Evo Aug 25 '16 at 12:46
  • Take a look at how I use `EventEmitter` in my answer, and see if you can refactor your code to do that, or you wait till EOD, when I can do the same with your sample code. It's busy at the office right now! – kennasoft Aug 25 '16 at 12:50

1 Answers1

0

My suspicion is that your crawler may be overwhelming the site you are trying to crawl, hence the ECONNRESET. I had a similar experience writing a node.js crawler myself. I had to self-throttle by sending my requests at timed intervals, to give the server some breathing space.

Example:

var request = require('request');
var EventEmitter = require('events').EventEmitter;

// Shared emitter: a finished request announces 'fetchNext', which schedules
// the next throttled request.
var emitter = new EventEmitter();

/**
 * Waits 60 seconds, then requests `url` and emits 'fetchNext' once the
 * request has completed, so the listener registered below starts the next
 * round. `url` is a placeholder: the surrounding code is expected to supply
 * the address to fetch.
 *
 * Nothing in this example starts the chain; call doCrawl() (or emit
 * 'fetchNext') once to kick it off.
 */
function doCrawl() {
  setTimeout(function() {
    //do crawling operation, e.g.
    request(url, function(err, resp, html) {
      if (err) {
        console.error(err);
      } else {
        //do all you want with the response
      }
      // Trigger the next round on failure too; otherwise a single
      // ECONNRESET would silently end the crawl.
      emitter.emit('fetchNext');
    });
  }, 60000);
}

emitter.on('fetchNext', doCrawl);

You may also want to consider doing this

EDIT>> Using your sample code

var urlStack = []; //an array that holds the list of urls you want to visit
var emitter = new EventEmitter();
emitter.on('fetchNext', delayedCrawl); //this is triggered after any item is saved

// Load the product paths from link.json (next to this script), queue one
// absolute URL per path, then start the throttled crawl.
fs.readFile(__dirname + '/link.json', 'utf8', function(err, data) {
  if (err) throw err;
  var urlp = JSON.parse(data).link;
  console.log(colors.yellow("Products:" + urlp.length));

  // NOTE(review): same range as the question's loop -- it queues
  // urlp[0] .. urlp[urlp.length - 3], so the last two entries are never
  // visited. Confirm whether that is intended.
  for (var i = 1; i < urlp.length - 1; i++) {
    urlStack.push('https://www.vinted.fr' + urlp[i - 1]);
  }
  emitter.emit('fetchNext');
});

/**
 * Schedules the next doCrawl() call after a fixed pause, so requests are
 * issued one at a time instead of all at once.
 */
function delayedCrawl() {
  setTimeout(doCrawl, 5000); //5-second delay
}

    /**
     * Pops one URL off urlStack, scrapes the product page with cheerio and
     * inserts the extracted fields into the `vinted` table.
     *
     * Emits 'fetchNext' once the page has been dealt with: from the insert
     * callback on success, or straight away when the request failed. Returns
     * without emitting when the stack is empty, which ends the crawl.
     */
    function doCrawl() {
      var url = urlStack.pop();
      if (!url) return;

      request(url, function(error, response, html) {
        if (error) {
          // here the error ECONNRESET/ENOTFOUND ...
          console.log(colors.red(error));
          // Move on to the next URL; without this a single failed request
          // would stall the whole crawl.
          emitter.emit('fetchNext');
          return;
        }

        var $ = cheerio.load(html);
        var price = $('span[itemprop=price]').text();
        var format_price = price.replace(/\n|\r/g, "");
        var format_price2 = format_price.replace(/ /g, "");
        var res1 = $('.details-list--details');
        var meta = $("link[rel='canonical']").attr('href');
        var images = []; // image array

        // Collect the data-src attribute of every product image.
        $('img[itemprop=image]').each(function() {
          images.push($(this).attr('data-src'));
        });

        var brand = $('.inverse > [itemprop=name]').text();
        var state = $('div[itemprop=itemCondition]').text();
        var color = $('div[itemprop=color]').text();
        console.log(url);

        // A non-empty .state-bar text is treated as "item sold".
        var token_vendu = $('.state-bar').text();
        var vendu = 0;
        if (token_vendu !== "") {
          console.log(colors.red('PRODUIT VENDU'));
          vendu = 1;
        }
        console.log(colors.blue("CallBack Vente " + vendu));

        var discount_price = $('.old-price').text();
        console.log("Discount: " + discount_price);

        // The size is cut out of the details-list text by splitting on runs
        // of whitespace; which token is used depends on whether a brand was
        // found. If parsing throws, formatsize stays undefined.
        var formatsize;
        try {
          var size;
          var format_size;
          var format_size2;
          if (brand === "") {
            size = res1.children().parent().text();
            format_size = size.replace(/ /g, "");
            format_size2 = format_size.replace(/[\n]/gi, " ");
            var split_size1 = format_size2.split("    ");
            var split_size2 = split_size1[0].split(" ");
            formatsize = split_size2[4];
          } else {
            size = res1.children().parent().children().text();
            format_size = size.replace(/ /g, "");
            format_size2 = format_size.replace(/[\n]/gi, " ");
            var split_size = format_size2.split("         ");
            console.log("split: " + split_size[1]);
            var split3 = split_size[1].split(" ");
            formatsize = split3[1];
          }
        } catch (e) {
          console.log(e.message);
        }
        console.log("Size : " + formatsize);
        console.log("Brand : " + brand);
        console.log(meta);
        console.log("color : " + color);
        console.log("state : " + state);

        //Save to database
        connection.query('INSERT INTO `vinted` VALUES ( NULL , ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP())', [
          brand,
          color,
          format_price2,
          discount_price,
          state,
          formatsize,
          vendu,
          images.join(", "),
          meta
        ], function(err, result) {
          // Continue with the next URL whether or not the insert succeeded.
          emitter.emit('fetchNext');
          if (err) {
            console.error('error inserting into database : ' + err.stack);
            return;
          }
        });
      });
    }
   
kennasoft
  • 1,595
  • 1
  • 14
  • 26
  • ok thanks you but your app var is not defined like me ... I have this personally `var express = require('express'), app = express(), server = require('http').createServer(app), io = require('socket.io'), fs = require('fs'), Crawler = require("crawler"), mysql = require('mysql'), request = require('request'), cheerio = require('cheerio'), prompt = require('prompt'), ExpressBrute = require('express-brute'), EventEmitter = require('events').EventEmitter;` – Split Evo Aug 25 '16 at 12:55
  • I've edited the answer to remove all the confusion with app and all that. Just the function and the logic around it. – kennasoft Aug 25 '16 at 13:11
  • so i must put all my switch in //do crawling operation, e.g. part ? I have more than one request(url, function(err, resp, html){...} – Split Evo Aug 25 '16 at 13:19
  • If this solves your problem, you can thank me by accepting the answer. – kennasoft Aug 25 '16 at 15:01
  • Its Working ! Thanks you very much :) – Split Evo Aug 25 '16 at 15:12