I use Phantomjs to scrape websites that use JavaScript and Ajax to load dynamic content.
I have the following code:
var page = require('webpage').create();
// Report JavaScript errors raised inside the loaded page on stderr:
// the error message first, then one line per stack frame when a trace is given.
page.onError = function(msg, trace) {
    // Render a single stack frame as ' -> file: line', appending the
    // function name only when the frame carries one.
    function describeFrame(frame) {
        var location = ' -> ' + frame.file + ': ' + frame.line;
        if (frame.function) {
            return location + ' (in function "' + frame.function + '")';
        }
        return location;
    }

    var report = ['ERROR: ' + msg];
    if (trace && trace.length) {
        report.push('TRACE:');
        trace.forEach(function(frame) {
            report.push(describeFrame(frame));
        });
    }
    console.error(report.join('\n'));
};
// Forward console output produced by the page's own scripts to this script's
// stdout, tagged with the line number and source id passed to the handler.
page.onConsoleMessage = function(msg, lineNum, sourceId) {
    var text = 'CONSOLE: ' + msg;
    var origin = ' (from line #' + lineNum + ' in "' + sourceId + '")';
    console.log(text + origin);
};
// Load the match page and print its current DOM serialization, then quit.
// NOTE: page.content is read as soon as the open callback runs, so markup that
// the page inserts through later Ajax calls may not be present yet.
page.open('http://www.betexplorer.com/soccer/germany/oberliga-bayern-sud/wolfratshausen-unterhaching-ii/x8rBMAB8/', function (status) {
    // PhantomJS passes 'success' or 'fail' to this callback. On failure there
    // is no meaningful page to dump, so report it and exit with a non-zero code
    // instead of printing an empty document and exiting as if all went well.
    if (status !== 'success') {
        console.error('Failed to load page (status: ' + status + ')');
        phantom.exit(1);
        return;
    }
    console.log(page.content);
    phantom.exit();
});
The problem is that this code doesn't retrieve the source code I want.
If you open the URL in a web browser (such as Chrome) and read the source code (the dynamic source code, after the JavaScript and Ajax calls have been made) of the page, you will see that the web browser's source code and the PhantomJS source code are completely different.
But in this case I need the web browser's source code.
Usually this PhantomJS code retrieves the source code I need, but in the case of this URL (and many others) PhantomJS does not retrieve the correct source code.
I assume Phantomjs doesn't know how to handle the JavaScript and Ajax calls that load dynamic content into this page.
I get these errors when I run the code:
ERROR: TypeError: 'undefined' is not a function (evaluating 'function(e){
this.pointer.x = e.pageX;
this.pointer.y = e.pageY;
}.bind(this)')
TRACE:
-> http://www.betexplorer.com/gres/tooltip.js?serial=1410131213: 207
-> http://www.betexplorer.com/gres/tooltip.js?serial=1410131213: 157
-> http://www.betexplorer.com/gres/tooltip.js?serial=1410131213: 310 (in function "tooltip")
-> http://www.betexplorer.com/soccer/germany/oberliga-bayern-sud/wolfratshausen-unterhaching-ii/x8rBMAB8/: 291
-> http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js: 2
-> http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js: 2
-> http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js: 2
-> http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js: 2
CONSOLE: Invalid App Id: Must be a number or numeric string representing the application id. (from line #undefined in "undefined")
CONSOLE: FB.getLoginStatus() called before calling FB.init(). (from line #undefined in "undefined")
So how do I get the dynamic source code of this page (http://www.betexplorer.com/soccer/germany/oberliga-bayern-sud/wolfratshausen-unterhaching-ii/x8rBMAB8/) using PhantomJS?