0

I'm trying to get my PhantomJS script to crawl multiple pages using a while loop. However, I realise that page loading is asynchronous, and the script only returns results for the last page. Any ideas how I can get it to return a result for each page?

Here is my code below:

// Reads file.csv line by line and, for every line, creates a page and calls
// page.open() on it from inside a synchronous while loop. The callback passed
// to page.open() logs the load status and, on success, the cookies.
//
// NOTE(review): nothing in the loop waits for a page.open() callback before
// the next iteration starts, so every page.open() call is issued before any
// callback has had a chance to run.
var  _output = {'cookies':[],'resources':{'js':[]}};
var fs = require('fs');
var file_h = fs.open('file.csv', 'r');
var line = file_h.readLine();

// The condition is a truthiness test, so the loop ends at the first empty
// string returned by readLine() (a blank line ends it just like end of file).
while(line)
{

  // `var` is function-scoped, not block-scoped: `page`, `system` and `address`
  // are the same three bindings on every iteration and are simply reassigned.
  // `system` is assigned here but not used anywhere in this snippet.
  var page = require('webpage').create(),
      system = require('system'),
      address;

  console.log(line);

  // open web page
  phantom.cookiesEnabled = true;
  address = line;
  // The callback closes over the shared `address` binding, so when it runs it
  // sees whatever value the loop assigned last, not necessarily the URL that
  // this particular page.open() call was given.
  page.open(address, function (status) {
      console.log("status " + status);
      if(status=='success'){

        // Overwrites any cookies stored by an earlier callback.
        _output.cookies = phantom.cookies; // record cookies
        _output.cookies.forEach(function(cookie){
          // String concatenation: a non-string cookie value is printed in its
          // default string form.
          console.log("cookie " + cookie);
        });
        // No phantom.exit() on this branch; exit is only requested on failure.
      }else{
        console.log('Unable to open provided URL: '+address);
        phantom.exit(-2); // -2: unable to open provided URL
      }
  });

  // to avoid errors detected while parsing the page (eg. Syntax Error, Type Error, etc.)
  // getting into stdout, so breaking the JSON decoding of returned output.
  // The handler body is intentionally empty: msg and trace are discarded.
  page.onError = function (msg, trace) {

  }
  line = file_h.readLine(); 
}
// Runs as soon as the loop has finished issuing page.open() calls; it does not
// wait for their callbacks.
file_h.close();
gadgetgem
  • 63
  • 11
  • Possible duplicate of [Using multiple page.open in one script](https://stackoverflow.com/questions/31188021/using-multiple-page-open-in-one-script) – Vaviloff May 10 '18 at 14:31
  • There are just so many of these: [Multiple page.open in one script](https://stackoverflow.com/questions/16996732/using-multiple-page-open-in-single-script), [Scraping multiple URLs by looping](https://stackoverflow.com/questions/34120421/scraping-multiple-urls-by-looping-in-phantomjs), [Need to open an array of URLs in PhantomJS](https://stackoverflow.com/questions/31732014/need-to-open-an-array-of-urls-in-phantomjs). (Please don't forget to search) – Vaviloff May 10 '18 at 14:35

1 Answer

0

Use Promises for the asynchronous tasks.

// Crawl every URL listed in file.csv (one URL per line) and log the load
// status and the cookies after each page.
//
// page.open() is callback-based, so each URL is wrapped in a Promise and the
// Promises are chained: the next page.open() is only issued once the previous
// one has reported its status. Because only one load is in flight at a time,
// a single page object can be reused for all URLs.
//
// NOTE(review): this needs a global Promise constructor. Only ES5 syntax is
// used (no arrow functions); if the PhantomJS build in use has no native
// Promise, load a polyfill before this script -- confirm for your version.
var page = require('webpage').create(),
    fs = require('fs');
var _output = {'cookies': [], 'resources': {'js': []}};

phantom.cookiesEnabled = true;

// to avoid errors detected while parsing the page (eg. Syntax Error, Type Error, etc.)
// getting into stdout, so breaking the JSON decoding of returned output.
// A script error inside a page is deliberately ignored and does not abort the crawl.
page.onError = function (msg, trace) {
};

/**
 * Read all non-blank lines of a text file.
 *
 * @param {string} path - file to read, one URL per line
 * @returns {string[]} trimmed, non-empty lines in file order
 */
function read_addresses(path) {
  var file_h = fs.open(path, 'r');
  var addresses = [];
  try {
    // Loop on atEnd() rather than on the truthiness of the line, so that a
    // blank line in the middle of the file does not end the read early.
    while (!file_h.atEnd()) {
      var line = file_h.readLine().trim();
      if (line) {
        addresses.push(line);
      }
    }
  } finally {
    // Release the handle even if reading throws.
    file_h.close();
  }
  return addresses;
}

/**
 * Open one URL in the shared page and record the cookies.
 *
 * @param {string} address - URL to open
 * @returns {Promise<string>} resolves with the status reported by page.open
 *   when it is 'success'; rejects with an Error otherwise
 */
function process_line(address) {
  return new Promise(function (resolve, reject) {
    console.log(address);

    // open web page
    page.open(address, function (status) {
      console.log("status " + status);
      if (status !== 'success') {
        reject(new Error('Unable to open provided URL: ' + address));
        return;
      }

      _output.cookies = phantom.cookies; // record cookies
      _output.cookies.forEach(function (cookie) {
        // Serialise the cookie; plain concatenation would print "[object Object]".
        console.log("cookie " + JSON.stringify(cookie));
      });
      resolve(status);
    });
  });
}

Promise.resolve()
  .then(function () {
    var statuses = [];
    // Build a sequential chain: each link waits for the previous page to finish.
    return read_addresses('file.csv')
      .reduce(function (chain, address) {
        return chain
          .then(function () {
            return process_line(address);
          })
          .then(function (status) {
            statuses.push(status);
          });
      }, Promise.resolve())
      .then(function () {
        return statuses;
      });
  })
  .then(function (statuses) {
    // use the results
    // PhantomJS keeps running until told to stop, so exit explicitly when done.
    phantom.exit(0);
  })
  .catch(function (err) {
    // report an error
    console.log(err && err.message ? err.message : String(err));
    phantom.exit(-2); // -2: unable to open provided URL
  });
hossein
  • 313
  • 1
  • 8
  • Thanks, I'm looking at this https://www.npmjs.com/package/promise-phantom as promise doesn't work with PhantomJs – gadgetgem May 10 '18 at 10:38
  • When trying your code PhantomJS throws error `SyntaxError: Parse error` and hangs. What version did you use to test this code? – Vaviloff May 10 '18 at 14:28
  • I did not check for syntax errors. I was demonstrating how it works. – hossein May 10 '18 at 14:52