0

I am building a simple web scraper. I am trying to scrape every link with the class name `pro-title` on this page: http://www.home.com/professionals/c/oho,-TN . I don't understand why the `thenOpen()` function is executing twice.

// Scraper script: collect the href of every `.pro-title` element on the
// listing page, open each one, and print the text of `div.info-list-text`.
var casper = require('casper').create({
    logLevel:"verbose",
    debug:true
});

// Shared between steps: written by the getLinks step, read by the step after it.
var links;
var name;
var paragraph;
var firstName;

casper.start('http://www.home.com/professionals/c/oho,-TN');

// Step 1: gather the raw href attribute of each `.pro-title` element.
casper.then(function getLinks(){
    // The callback handed to evaluate() runs inside the loaded page,
    // so it can only use the page's own `document`.
    links = this.evaluate(function(){
        var anchors = document.getElementsByClassName('pro-title');
        // getElementsByClassName returns an array-like collection, hence map.call.
        return Array.prototype.map.call(anchors, function(anchor){
            return anchor.getAttribute('href');
        });
    });
});

// Step 2: queue one navigation per collected href.
// Note: hrefs are passed to thenOpen() exactly as collected; nothing here
// checks that they are usable URLs.
casper.then(function(){
    this.each(links, function(self, href){
        self.thenOpen(href, function(response){
            this.echo(this.fetchText('div.info-list-text'));
        });
    });
});

// Run all queued steps, then quit.
casper.run(function(){
    this.exit();
});
  • 1
    It probably has something to do with the links where only every second `a` element contains a usable href attribute. You will need to find out how to recover the actual hyperlink. – Artjom B. Feb 04 '16 at 21:20

1 Answer

1

Artjom B was correct in that you were gathering href attributes that weren't valid URLs. You can eliminate them using a regular expression.

// Scraper script: collect the href of every `.pro-title` element on the
// listing page, open only those that look like real URLs, and print the text
// of `div.info-list-text` found on each opened page.
//
// NOTE(review): the CasperJS docs list "debug"/"info"/"warning"/"error" as
// logLevel values and a separate boolean `verbose` option — confirm that
// `logLevel:"verbose"` and `debug:true` produce the logging that is intended.
var casper = require('casper').create({
    logLevel:"verbose",
    debug:true
});

// hrefs collected by the getLinks step and consumed by the step after it.
var links;

// Loose "looks like a URL" pattern: some host-like text, a dot, a 2-4 letter
// TLD, and an optional path. Deliberately has no `g` flag: the pattern is only
// used for a yes/no check, and a global regex would make .test() stateful
// through lastIndex.
var regex = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/i;

/**
 * Decide whether a collected href is worth opening.
 *
 * @param {*} link - value returned by getAttribute('href'); null when the
 *     element has no href attribute.
 * @returns {boolean} true only for strings that match the URL pattern.
 */
function isOpenableLink(link) {
    // Guard the type first: calling a string method on null would throw.
    return typeof link === 'string' && regex.test(link);
}

casper.start('http://www.houzz.com/professionals/c/Nashville,-TN');

// Step 1: gather the raw href attribute of each `.pro-title` element.
casper.then(function getLinks(){
    // The callback handed to evaluate() runs inside the loaded page,
    // so it can only use the page's own `document`.
    links = this.evaluate(function(){
        var anchors = document.getElementsByClassName('pro-title');
        // getElementsByClassName returns an array-like collection, hence map.call.
        return Array.prototype.map.call(anchors, function(anchor){
            return anchor.getAttribute('href');
        });
    });
});

// Step 2: queue one navigation per usable href and print the scraped text.
casper.then(function(){
    this.each(links, function(self, link){
        if (!isOpenableLink(link)) {
            // Skip placeholders, missing hrefs and anything else that is not a URL.
            return;
        }
        self.thenOpen(link, function(){
            this.echo(this.fetchText('div.info-list-text'));
        });
    });
});

// Run all queued steps, then quit.
casper.run(function(){
    this.exit();
});
medinasod
  • 375
  • 3
  • 9
  • Thank you. That was very helpful. This outputs data from 15 links on the page, but there are 30 links (the ones that are not valid URLs). Is there a way I can extract data from the remaining links? –  Feb 05 '16 at 15:45
  • The same data I am extracting in the code above (`Contact, location and the type of business they do`). So, basically, I need to find a way to open the URLs that are not valid. –  Feb 05 '16 at 15:53
  • To do that, you should click and follow all the links. This method might help with that: http://stackoverflow.com/questions/14593449/click-on-all-links-matching-a-selector – medinasod Feb 05 '16 at 17:25