0

I am scraping a website that is built with React components, using PhantomJS from Node.js.

With this: https://github.com/amir20/phantomjs-node

Here is the code:

// Start PhantomJS, open `url`, and print the page's HTML.
// The content is read as soon as `open` settles; nothing waits any further.
phantom.create()
    .then(function (instance) {
        _ph = instance;
        return instance.createPage();
    })
    .then(function (newPage) {
        _page = newPage;
        return newPage.open(url);
    })
    .then(function () {
        return _page.property('content');
    })
    .then(function (html) {
        console.log(html);
        _page.close();
        _ph.exit();
    })
    .catch(function (err) {
        console.log(err);
    });

The problem is that the React content is not rendered; the output only contains `<!-- react-empty: 1 -->` where the actual React component should be loaded.

How can I scrape the rendered React component? I initially switched from a pure node-request solution to PhantomJS to fix this, but now I am stuck.


UPDATE:

So I don't have a real solution yet. I switched to NightmareJS (https://github.com/segmentio/nightmare), which has a nice .wait('.some-selector') function that waits until the specified selector is loaded. This fixed my problems with dynamically loaded React components.

DennisKo
  • 263
  • 2
  • 6
  • 19
  • Does this package support receiving page.onError callbacks? Are there any errors? – Vaviloff Nov 05 '16 at 17:07
  • Yes, there is a `console.log(status);` and it returns success. I get the whole HTML content except the React components, where I get `<!-- react-empty: 1 -->` – DennisKo Nov 05 '16 at 17:27
  • I suspect `status` is from the `page.open` callback, and you need to check for errors in the page.onError callback. Could be something like this: [Why I am not able to render my ReactJS application using PhantomJS](http://stackoverflow.com/questions/38469005/why-i-am-not-able-to-render-my-reactjs-application-using-phantomjs-2-1-1) – Vaviloff Nov 06 '16 at 07:28

1 Answers1

3

I think you should wait for the React elements to be rendered after the page has loaded. An example of such a waiting function, using Q promises, is below. The function returns a promise and checks the page state every 50 ms. If the required page state is reached, the function resolves the promise. If the timeout elapses first, the function rejects the promise.

var phantom = require('phantom');
var Q = require('q');
var _ph, _page, _outObj;
var url = 'https://tech.yandex.ru/maps/jsbox/';

// Open `url` in PhantomJS, wait until the page reports the expected state,
// then print the rendered HTML. The page and the PhantomJS process are
// released on both the success and the failure path.
phantom.create().then(ph => {
    _ph = ph;
    return _ph.createPage();
}).then(page => {
    _page = page;
    return _page.open(url);
}).then(status => {
    console.log(status);
    // Fail fast instead of polling a page that never loaded.
    if (status !== 'success') {
        throw new Error('Failed to open ' + url + ' (status: ' + status + ')');
    }
    return waitState(textPopulated, 3);
}).then(() => {
    return _page.property('content');
}).then(content => {
    console.log(content);
}).catch(e => {
    console.log(e);
}).then(() => {
    // Runs after success and after a handled failure alike; without this a
    // timeout would leave the PhantomJS child process running.
    if (_page) {
        _page.close();
    }
    if (_ph) {
        _ph.exit();
    }
});

/**
 * Checks whether the element matching `.ace_text-layer` has at least one
 * child element in the currently opened page (`_page`).
 *
 * @returns {Promise<boolean>} true when the evaluated child count is > 0.
 */
function textPopulated() {
    // This callback is handed to page.evaluate, so it is kept in plain ES5.
    var pending = _page.evaluate(function() {
        var textLayer = document.querySelector('.ace_text-layer');
        if (!textLayer) {
            return textLayer;
        }
        return textLayer.childElementCount;
    });

    return pending.then((count) => {
        console.log('childElementCount: ' + count);
        const hasChildren = count > 0;
        return hasChildren;
    });
}

/**
 * Polls a predicate every 50 ms until it reports success or the time limit
 * is exceeded.
 *
 * @param {Function} state - Named predicate, called with no arguments. May
 *     return a boolean or a promise for one; its `name` appears in the logs.
 * @param {number} [timeout] - Time limit in seconds. When omitted (or 0) the
 *     limit is 20 seconds.
 * @returns {Promise} Fulfils with undefined once `state` yields a truthy
 *     value. Rejects with an Error('Timeout state: <name>') when the limit is
 *     exceeded, or with whatever `state` throws or rejects with.
 */
function waitState(state, timeout) {  // timeout in seconds is optional
    console.log('Start waiting for state: ' + state.name);

    const limitTime = timeout * 1000 || 20000;
    const startTime = Date.now();

    return wait();

    function wait() {
        // Calling `state` inside a promise callback turns a synchronous throw
        // into a rejection and lets predicates return plain booleans.
        return Promise.resolve().then(function() {
            return state();
        }).then(function(result) {
            if (result) {
                console.log('Reached state: ' + state.name);
                return;
            }
            if (Date.now() - startTime > limitTime) {
                const errorMessage = 'Timeout state: ' + state.name;
                console.log(errorMessage);
                throw new Error(errorMessage);
            }
            return Q.delay(50).then(wait);
        });
    }
}
a-bobkov
  • 724
  • 7
  • 15