There are sites whose DOM and contents are generated dynamically when the page loads. (Angularjs-based sites are notorious for this)
What approach do you use? I tried both phantomjs and jsdom but it seems I am unable get the page to execute its javascript before I scrape.
Here's a simple jsdom example (not angularjs-based but still dynamically generated)
var env = require('jsdom').env;
exports.scrape = function(link, callback) {
var config = {
url: link,
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.110 Safari/537.36'
},
done: jsdomDone
};
env(config);
}
function jsdomDone(err, window) {
var info = null;
if(err) {
console.error(err);
} else {
var $ = require('jquery')(window);
console.log($('.profilePic').attr('src'));
}
}
exports.scrape('https://www.facebook.com/elcompanies');
I tried phantomjs with moderate success.
var page = new WebPage()
var fs = require('fs');
page.onLoadFinished = function() {
console.log("page load finished");
window.setTimeout(function() {
page.render('export.png');
fs.write('1.html', page.content, 'w');
phantom.exit();
}, 10000);
};
page.open("https://www.facebook.com/elcompanies", function() {
page.evaluate(function() {
});
});
Here I wait for the onLoadFinished event and even put a 10-second timer. The interesting thing is that while my export.png image capture of the page shows a fully rendered page, my 1.html doesn't show the .profilePic class element in its rightful place. It seems to be sitting in some javascript code, surrounded by some kind of "require("TimeSlice").guard(function() {bigPipe.onPageletArrive({..." block
If you can provide me a working example that scrapes the image off this page, that'd be helpful.