0

I've a page like this one:

<html>
  <body>
    <table>
      <thead>
        <tr>
          <th>Link</th><th>Description</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td><a href="https://www.google.com">Google</a></td><td>Search engine</td>
        </tr>
        <tr>
          <td><a href="https://github.com">Github</a></td><td>Code management</td>
        </tr>
      </tbody>
    </table>
  </body>
</html>

I would like to parse every row of the table and follow each link (to get the HTML's page title) to create an array of sites like this one:

[ { name: 'Google',
title: 'Google',
descr: 'Search engine' },
  { name: 'Github',
title: 'GitHub ยท Where software is built',
descr: 'Code management' } ]

I thought this is a good example to start learning using Promises and the Q library, but I failed to grasp how promises works. Below the code that I wrote:

var request = require('request');
var cheerio = require('cheerio');
var Q = require('q');

var sites = [];

var loadPage = function(url){
  var deferred = Q.defer();

  request(url, function (error, response, html) {
if (!error && response.statusCode == 200) {
  var $ = cheerio.load(html);
  deferred.resolve($);
} else {
  deferred.reject(new Error(error));
}
  });
  return deferred.promise;
}

var parseRows = function($){
  var promises = [];

  $("tbody tr").each(function(){
  var $cells = $('td', this);

  var $firstC = $cells.eq(0);
  var name  = $firstC.text();
  var link  = $firstC.find('a').attr('href');
  var descr = $cells.eq(1).text();
  promises.push(Q.fcall(function () {
      var site = {name: name, descr: descr};
      loadPage(link).then(function($){
        var title = $("title").text(); 
        console.log(title);
        // here I don't know how to set the title
        // as obj's attribute
       });
       return site;
     }));
  });
  return Q.all(promises);
}

var displayTitles = function(res){
  for (var i = 0, len = res.length; i < len; i++) {

    var obj = res[i];
  }
  return Q.fcall(function () {
    return sites;
  });
}

loadPage('http://127.0.0.1/sample.html')
  .then(parseRows)
  .then(displayTitles)
  .done();

I'm satisfied with loadPage function but I'm stuck with parseRows, beacuse I'm not able to set the title as property of "site" object. Moreover displayTitles was initially developed to handle the logic required to get the page's title, but now is almost useless.

How can I modify the code above in order to get the desired array as output in a more clean and readable way?

alessmar
  • 4,689
  • 7
  • 43
  • 52
  • This question appears to be off-topic because it belongs on [Code Review](http://codereview.stackexchange.com). โ€“ Evan Davis Nov 20 '14 at 20:53
  • Eh, I might be wrong; can you clean this up into a single problem statement with a self-contained example? You've got a lot of cruft here. โ€“ Evan Davis Nov 20 '14 at 20:54
  • possible duplicate of [How to sequentially run promises with Q in Javascript?](http://stackoverflow.com/q/18386753/1048572) โ€“ Bergi Feb 10 '16 at 15:44

2 Answers2

0

I think your main problem is that Q.fcall is being resolved instantly, not after pageLoad has been resolved. A little restructuring should help:

var promises = [];

// ...

$("tbody tr").each(function(){

// ..

  promises.push(loadPage(link).then(function($){
    var site = {name: name, descr: descr};
    site.title = $("title").text(); 
    return site;
  }));

});
return Q.all(promises);

As to how you can further condense the code, you could try this:

var parseRows = function ($) {
  return Q.all($("tbody tr").map(function () {
    var $cells = $('td', this);
    var $firstC = $cells.eq(0);
    var name = $firstC.text();
    var link = $firstC.find('a').attr('href');
    var descr = $cells.eq(1).text();

    return loadPage(link).then(function ($) {
      // are you sure there is a TITLE element? Did you perhaps mean .title?
      return {name: name, descr: descr, title: $("title").text()};
    });
  });
};

I don't know what you want to achive within the displayTitles function, so I can't help there. But I'm pretty sure you don't need the extra Q.fcall wrapper. According to the Promises (as far as i remember) you should be able to simply do retrun sites to resolve. Also, PERSONALLY, I'd stick with the native Promises API, which is supported in all recent browsers but IE (http://caniuse.com/#search=promise) but it seems you're using node anyways.

lordvlad
  • 5,200
  • 1
  • 24
  • 44
0

After playing a bit with Q framework I decided follow @lordvald advice and I switched to native Promises API. Below there's the code that answers my question:

var request = require('request')
var cheerio = require('cheerio')

var loadPage = function(url) {
    var promise = new Promise(function(resolve, reject) {
        request(url, function(error, response, html) {
            if (!error && response.statusCode == 200) {
                resolve(cheerio.load(html))
            } else {
                reject(new Error(error))
            }
        })
    })
    return promise
}

var parseRows = function($) {
    return $('tbody tr').map(function() {
        var $cells = $('td', this)
        var firstC = $cells.eq(0)
        var url = firstC.eq(0).find("a").attr("href")
        return {
            name: firstC.text(),
            descr: $cells.eq(1).text(),
            url: url
        }
    }).get()
}

var loadSiteTitle = function(sites) {
    return Promise.all(sites.map(function(site) {
        return loadPage(site.url).then(function($) {
            site.title = $("title").text()
            delete site.url
            return site
        })
    }))
}

loadPage('http://127.0.0.1/sample.html')
    .then(parseRows)
    .then(loadSiteTitle)
    .then(function(sites) {
        console.log(sites)
    })
    .catch(function(e) {
        console.log('Unexpected error: ' + e.message)
        process.exit(1)
    })
alessmar
  • 4,689
  • 7
  • 43
  • 52