3

I'm trying to scrape a website that is protected by Cloudflare, but whatever I do I get 403 Forbidden errors.

I've read that this happens because the requests come from a headless browser. Is there any way to bypass this? I'll leave my current settings below. Is there anything more I can do to prevent detection?

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

// Register the stealth plugin before any browser is launched.
puppeteer.use(StealthPlugin());

/**
 * Launches Chromium through puppeteer-extra, navigates to the URL passed as
 * the first command-line argument and logs the HTTP status code of every
 * response the page receives.
 *
 * Exits with a non-zero exit code when no URL is supplied or when launching
 * or navigating fails. The browser is always closed, even on failure.
 */
(async () => {
  const targetUrl = process.argv[2];
  if (!targetUrl) {
    console.error("Usage: node <script> <url>");
    process.exitCode = 1;
    return;
  }

  const args = [
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-accelerated-2d-canvas",
    "--no-zygote",
    "--renderer-process-limit=1",
    "--no-first-run",
    "--ignore-certificate-errors",
    "--ignore-certificate-errors-spki-list",
    "--disable-dev-shm-usage",
    "--disable-infobars",
    "--lang=en-US,en",
    // Chromium expects "width,height"; the "1920x1080" form is not parsed.
    "--window-size=1920,1080",
    "--disable-extensions",
  ];

  const options = {
    args,
    headless: true,
    ignoreHTTPSErrors: true,
    userDataDir: "./tmp",
    executablePath: "/snap/bin/chromium",
  };

  // Declared outside the try block so the finally clause can close it.
  let browser;
  try {
    browser = await puppeteer.launch(options);
    const page = await browser.newPage();

    page.on("response", (response) => {
      console.log(response.status());
    });

    await page.setExtraHTTPHeaders({
      // Quality values use a dot ("q=0.5"); a comma would start a new entry.
      "Accept-Language": "en,en-US;q=0.5",
      // The catch-all media range is "*/*", not "/".
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    });

    await page.goto(targetUrl);

    // Pause after navigation (not before it, when nothing is loading yet) so
    // responses that arrive after the initial load are still logged before
    // the browser is closed. A plain timer replaces the deprecated
    // page.waitFor().
    await new Promise((resolve) => setTimeout(resolve, 5000));
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  } finally {
    // Always release the Chromium process, even when launch succeeded but a
    // later step threw.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.error(closeErr);
        process.exitCode = 1;
      }
    }
  }
})();
Behemoth
  • 5,389
  • 4
  • 16
  • 40
bas kruithof
  • 41
  • 1
  • 2
  • 1
    Cloudflare have a lot of stuff going on to avoid unknown bots/scrapers. Running in headless mode seems to be one thing that makes this fail. https://stackoverflow.com/questions/58630498/why-does-cloudflare-403-only-on-headless-requests https://github.com/puppeteer/puppeteer/issues/7006 – Karl-Johan Sjögren Aug 08 '21 at 09:01
  • 1
    Here's the solution https://stackoverflow.com/a/55714786/7058031 – hamza ajaz Dec 15 '21 at 14:28

0 Answers0