30

Puppeteer and PhantomJS are similar. The issue I'm having is happening for both, and the code is also similar.

I'd like to scrape some information from a website that requires authentication to view it. I can't even access the home page, because my request is detected as "suspicious activity", as shown in this screenshot: https://i.stack.imgur.com/Atovn.png

I discovered that the problem doesn't happen when I test with Postman, sending a header named Cookie whose value is the cookie captured from my browser, but that cookie expires after some time. So I guess Puppeteer and PhantomJS are both failing to pick up the cookies, because the site is denying access to the headless browser.

What can I do to bypass this?

// Simple Javascript example (PhantomJS): load the home page and save a screenshot.
var page = require('webpage').create();
var url = 'https://www.expertflyer.com';

page.open(url, function (status) {
    var loaded = (status === "success");

    if (loaded) {
        page.render("home.png");
    } else {
        console.log("Failed to load " + url + " (status: " + status + ")");
    }

    // Always exit, even on failure: otherwise the PhantomJS process never
    // terminates when the page cannot be loaded. A non-zero exit code
    // signals the failure to the calling shell.
    phantom.exit(loaded ? 0 : 1);
});

4 Answers4

24

In case anyone runs into the same problem in the future: use puppeteer-extra.

I have tested the code on a server. On the 2nd run there was a Google CAPTCHA. You can solve it yourself and restart the bot, or use a CAPTCHA-solving service.

I ran the code more than 10 times and there was no IP ban. I did not get the CAPTCHA again during my continuous runs.

But you can get captcha again!

// Install the third-party dependencies first:
//   sudo npm install puppeteer puppeteer-extra puppeteer-extra-plugin-stealth puppeteer-extra-plugin-adblocker readline
// NOTE(review): `readline` is a Node.js built-in module, so installing it from npm should not be necessary - confirm.

// First command-line argument. run() launches the browser headless only when
// this is exactly the string 'true'; any other value (or none) shows the window.
var headless_mode = process.argv[2]

const readline = require('readline');
// puppeteer-extra wraps puppeteer and lets plugins be registered via .use().
const puppeteer = require('puppeteer-extra')
// Register the stealth plugin (presumably to make the automated browser harder to detect).
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin())
// Register the adblocker plugin with its tracker-blocking option enabled.
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker')
puppeteer.use(AdblockerPlugin({ blockTrackers: true }))


/**
 * Opens https://www.expertflyer.com in a puppeteer-extra browser, checks whether
 * the site served its normal page or a "suspicious activity" page, and, when the
 * site is reachable, clicks through two menu entries and saves a screenshot of each.
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<void>}
 * @throws {Error} when the page cannot be loaded or a browser operation fails
 */
async function run () {
  const browser = await puppeteer.launch({
    // Headless only when the first CLI argument is exactly the string 'true'.
    headless: headless_mode === 'true',
    ignoreHTTPSErrors: true,
    slowMo: 0,
    args: [
      '--window-size=1400,900',
      '--remote-debugging-port=9222',
      // SECURITY: binding the DevTools port to 0.0.0.0 lets anyone who can reach
      // this machine take over the browser. It is kept because the CAPTCHA is
      // meant to be solved remotely via chrome://inspect; firewall the port or
      // use 127.0.0.1 plus an SSH tunnel if that is not needed.
      '--remote-debugging-address=0.0.0.0',
      '--disable-gpu',
      '--disable-features=IsolateOrigins,site-per-process',
      '--blink-settings=imagesEnabled=true'
    ]
  });

  let page;
  try {
    page = await browser.newPage();

    console.log(`Testing expertflyer.com`);
    await goto_Page('https://www.expertflyer.com');
    await waitForNetworkIdle(page, 3000, 0);
    await checking_error(do_2nd_part);
  } finally {
    // Close the browser on every path so no Chromium process is left behind.
    await browser.close();
  }

  /**
   * Resolves after `ms` milliseconds. Used instead of `page.waitFor`, which is
   * deprecated and absent from current Puppeteer releases.
   * @param {number} ms
   * @returns {Promise<void>}
   */
  function sleep (ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Returns the trimmed innerText of the first element matching `selector`,
   * or null when it cannot be read (for example, the element is missing).
   * @param {string} selector
   * @returns {Promise<string|null>}
   */
  async function readText (selector) {
    try {
      const text = await page.$eval(selector, (e) => e.innerText);
      return typeof text === 'string' ? text.trim() : null;
    } catch (error) {
      // A missing element is an expected outcome here, not a failure.
      return null;
    }
  }

  /**
   * Best-effort: clicks a menu link, waits 5 seconds, logs the page heading
   * (if any) and saves a screenshot.
   * @param {string} menuSelector CSS selector of the link to click
   * @param {string} screenshotPath file the screenshot is written to
   * @returns {Promise<void>}
   */
  async function openMenuAndCapture (menuSelector, screenshotPath) {
    try {
      await page.click(menuSelector);
    } catch (error) {
      // The click stays optional, but the reason is no longer hidden.
      console.log(`Could not click ${menuSelector}: ${error.message}`);
    }
    await sleep(5000);

    const heading = await readText('#headerTitleContainer > h1');
    if (heading !== null) {
      console.log(heading);
    }
    await page.screenshot({ path: screenshotPath });
  }

  /** Second step: open the '#yui-gen2' menu entry, then re-check the page and continue. */
  async function do_2nd_part () {
    await openMenuAndCapture('#yui-gen2 > a', 'expertflyer1.png');
    await checking_error(do_3nd_part);
  }

  /** Third (last) step: open the '#yui-gen1' menu entry. */
  async function do_3nd_part () {
    await openMenuAndCapture('#yui-gen1 > a', 'expertflyer2.png');
    console.log(`All done, check the screenshots?`);
  }

  /**
   * Decides whether the current page is the normal site or an error page and
   * acts on it:
   *  - menu links present: run `callback`;
   *  - CAPTCHA page: wait until the user confirms it is solved, then run `callback`;
   *  - IP-blocked page: report it and stop;
   *  - anything else (page still loading, or it could not be inspected): wait
   *    10 seconds and check again, at most `maxRechecks` times.
   *
   * Errors thrown by `callback` propagate to the caller instead of being swallowed.
   *
   * @param {() => Promise<void>} callback next step to run once the site is usable
   * @param {number} [maxRechecks=6] how many 10-second re-checks to allow
   * @returns {Promise<void>}
   */
  async function checking_error (callback, maxRechecks = 6) {
    const captcha_msg = "Due to suspicious activity from your computer, we have blocked your access to ExpertFlyer. After completing the CAPTCHA below, you will immediately regain access unless further suspicious behavior is detected.";
    const ip_blocked = "Due to recent suspicious activity from your computer, we have blocked your access to ExpertFlyer. If you feel this block is in error, please contact us using the form below.";

    for (let recheck = 0; recheck <= maxRechecks; recheck++) {
      // null means "could not inspect the page"; that must not count as success.
      let menuLinkCount = null;
      try {
        menuLinkCount = await page.evaluate(
          () => document.querySelectorAll('a[class="text yuimenubaritemlabel"]').length
        );
      } catch (error) {
        console.log(`catch error ${error}`);
      }

      if (menuLinkCount !== null && menuLinkCount > 0) {
        console.log(`Page loaded successfully! You can do things here.`);
        await callback();
        return;
      }

      if (menuLinkCount === 0) {
        console.log(`Error found`);
      }
      const error_msg_details = await readText('body > p:nth-child(2)');

      if (error_msg_details === captcha_msg) {
        console.log(`Google Captcha found, You have to solve the captch here manually or some automation recaptcha service`);
        await verify_User_answer();
        await callback();
        return;
      }

      if (error_msg_details === ip_blocked) {
        console.log(`The current ip address is blocked. The only way is change the ip address.`);
        return;
      }

      if (recheck < maxRechecks) {
        console.log(`Waiting for error page load... Waiting for 10 sec before rechecking...`);
        await sleep(10000);
      }
    }

    console.log(`Giving up: the page state is still unknown after ${maxRechecks} re-checks.`);
  }

  /**
   * Navigates to `page_URL`, retrying on failure.
   * @param {string} page_URL
   * @param {number} [maxAttempts=3] total number of navigation attempts
   * @returns {Promise<void>}
   * @throws {Error} when every attempt failed (the last error is attached as `cause`)
   */
  async function goto_Page (page_URL, maxAttempts = 3) {
    for (let attempt = 1; ; attempt++) {
      try {
        await page.goto(page_URL, { waitUntil: 'networkidle2', timeout: 30000 });
        return;
      } catch (error) {
        if (attempt >= maxAttempts) {
          throw new Error(`Failed to load ${page_URL} after ${attempt} attempts`, { cause: error });
        }
        console.log(`Error in loading page, re-trying...`);
      }
    }
  }

  /**
   * Keeps prompting on stdin until the user answers "yes" (case-insensitive,
   * surrounding whitespace ignored), then runs `call_back` if one was given.
   * @param {() => Promise<void>} [call_back] optional action to run after confirmation
   * @returns {Promise<void>}
   */
  async function verify_User_answer (call_back) {
    let user_Answer = await readLine();
    while (user_Answer.trim().toLowerCase() !== 'yes') {
      console.log(`answer not match. try again...`);
      user_Answer = await readLine();
    }

    console.log(`user_Answer is ${user_Answer}, Processing...`);
    if (typeof call_back === 'function') {
      await call_back();
    }
  }

  /**
   * Prompts once on stdin and resolves with the line the user typed.
   * @returns {Promise<string>}
   * @throws {Error} when stdin is closed before an answer arrives
   */
  function readLine () {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });

    return new Promise((resolve, reject) => {
      let answered = false;

      // Without this, a closed stdin (e.g. no terminal attached) would leave
      // the promise pending forever.
      rl.once('close', () => {
        if (!answered) {
          reject(new Error('stdin was closed before the prompt was answered'));
        }
      });

      rl.question('Solve the captcha and type yes to continue: ', (answer) => {
        answered = true;
        rl.close();
        resolve(answer);
      });
    });
  }

  /**
   * Resolves once the page has had at most `maxInflightRequests` requests in
   * flight for `timeout` milliseconds without interruption. Only requests that
   * start after this function is called are counted.
   * @param {object} page Puppeteer page emitting 'request', 'requestfinished' and 'requestfailed'
   * @param {number} timeout length of the quiet period, in milliseconds
   * @param {number} [maxInflightRequests=0] number of in-flight requests still considered idle
   * @returns {Promise<void>}
   */
  function waitForNetworkIdle (page, timeout, maxInflightRequests = 0) {
    console.log('waitForNetworkIdle called');
    page.on('request', onRequestStarted);
    page.on('requestfinished', onRequestFinished);
    page.on('requestfailed', onRequestFinished);

    let inflight = 0;
    let fulfill;
    const promise = new Promise((resolve) => { fulfill = resolve; });
    let timeoutId = setTimeout(onTimeoutDone, timeout);
    return promise;

    function onTimeoutDone () {
      page.removeListener('request', onRequestStarted);
      page.removeListener('requestfinished', onRequestFinished);
      page.removeListener('requestfailed', onRequestFinished);
      fulfill();
    }

    function onRequestStarted () {
      ++inflight;
      // Too many requests in flight: the quiet period is interrupted.
      if (inflight > maxInflightRequests) {
        clearTimeout(timeoutId);
      }
    }

    function onRequestFinished () {
      // Ignore completions of requests that started before we began counting.
      if (inflight === 0) {
        return;
      }
      --inflight;
      // Back at the threshold: start a fresh quiet period.
      if (inflight === maxInflightRequests) {
        timeoutId = setTimeout(onTimeoutDone, timeout);
      }
    }
  }
}

// Report failures instead of leaving an unhandled promise rejection.
run().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

Please note that the "Solve the captcha and type yes to continue: " prompt does not work as expected and needs some fixing.

Edit: I re-ran the bot after 10 minutes and got the CAPTCHA again. I solved the CAPTCHA via chrome://inspect/#devices and restarted the bot, and everything worked again. No IP ban.

Shimul D
  • 403
  • 4
  • 9
20

Things that can help in general :

  • Headers should be similar to common browsers, including :
  • If you make multiple requests, put a random delay between them
  • If you open links found in a page, set the Referer header accordingly
  • Images should be enabled
  • Javascript should be enabled
    • Check that "navigator.plugins" and "navigator.language" are set in the client javascript page context
  • Use proxies
Grubshka
  • 533
  • 4
  • 9
9

If you think about it from the website's perspective, you are indeed doing something suspicious. So whenever you want to bypass something like this, make sure to think about how they are thinking.

Set cookie properly

Puppeteer, PhantomJS, etc. use real browsers, and the cookies used there are better than those sent via Postman or similar tools. You just need to set the cookies properly.

You can use page.setCookie(...cookies) to set the cookies. Cookies are serialized, so if cookies is an array of objects, you can simply do this:

// Example values only - replace them with the real cookies for the site.
const cookies = [{name: 'test', value: 'foo'}, {name: 'test2', value: 'foo'}];
// Spread the array so that each cookie object is passed as its own argument.
await page.setCookie(...cookies);

Try to tweak the behaviors

Turn off the headless mode and see the behavior of the website.

// Launch with headless mode turned off so the site's behavior can be observed.
await puppeteer.launch({headless: false})

Try proxies

Some websites monitor traffic by IP address; if multiple hits come from the same IP, they block the requests. In that case it's best to use rotating proxies.

Md. Abu Taher
  • 17,395
  • 5
  • 49
  • 73
5

The website you are trying to visit uses Distil Networks to prevent web scraping.

People have had success in the past bypassing Distil Networks by substituting the $cdc_ variable found in Chromium's call_function.js (which is used in Puppeteer).

For example:

/**
 * Returns the cache object stored on the document under a fixed property
 * name, creating and attaching it on first use.
 *
 * @param {Document=} opt_doc document to use; falls back to the global `document`
 * @param {boolean=} opt_w3c when truthy, a new cache is a CacheWithUUID, otherwise a Cache
 * @return {Object} the cache attached to the document
 */
function getPageCache(opt_doc, opt_w3c) {
  const targetDoc = opt_doc || document;
  const useW3c = opt_w3c || false;
  // Original: var key = '$cdc_asdjflasutopfhvcZLmcfl_';    <-- This is the line that is changed.
  const cacheKey = '$something_different_';

  // Create the cache only once; later calls reuse the one already attached.
  if (!(cacheKey in targetDoc)) {
    targetDoc[cacheKey] = useW3c ? new CacheWithUUID() : new Cache();
  }
  return targetDoc[cacheKey];
}

Note: According to this comment, if you have been blacklisted before you make this change, you face another set of challenges, so you must "implement fake canvas fingerprinting, disable flash, change IP, and change request header order (swap language and Accept headers)."

Grant Miller
  • 27,532
  • 16
  • 147
  • 165