I'm trying to scrape a website that is behind Cloudflare, but whatever I do I get 403 Forbidden errors.
I've read that this happens because the requests come from a headless browser. Is there any way to bypass this? I'll leave my current settings below. Is there anything more I can do to prevent detection?
// puppeteer-extra is used in place of plain puppeteer so that plugins can be
// attached to it via `use()` below.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Register the stealth plugin on the puppeteer-extra instance before any
// browser is launched further down in this script.
puppeteer.use(StealthPlugin());
/**
 * Script entry point.
 *
 * Launches Chromium through the `puppeteer` instance configured above, opens
 * the URL passed as the first command-line argument, and logs the HTTP status
 * code of every response the page receives.
 *
 * Usage: node <script> <url>
 *
 * Failures are logged rather than rethrown, and the browser process is always
 * closed, even when launching or navigating fails.
 */
(async () => {
  // How long the page is kept open after navigation so that responses which
  // arrive after the initial load (redirects, follow-up requests) are logged.
  const settleDelayMs = 5000;

  const targetUrl = process.argv[2];
  if (!targetUrl) {
    // Without a URL, page.goto() would reject with an opaque protocol error.
    console.error("Usage: node <script> <url>");
    return;
  }

  const args = [
    "--no-sandbox",
    "--disable-setuid-sandbox",
    "--disable-accelerated-2d-canvas",
    "--no-zygote",
    "--renderer-process-limit=1",
    "--no-first-run",
    "--ignore-certificate-errors",
    "--ignore-certificate-errors-spki-list",
    "--disable-dev-shm-usage",
    "--disable-infobars",
    "--lang=en-US,en",
    // Chromium expects "width,height"; the "1920x1080" form is not parsed.
    "--window-size=1920,1080",
    "--disable-extensions",
  ];

  const options = {
    args,
    headless: true,
    ignoreHTTPSErrors: true,
    userDataDir: "./tmp",
    executablePath: "/snap/bin/chromium",
  };

  // Declared outside the try block so the finally clause can close it.
  let browser;
  try {
    browser = await puppeteer.launch(options);
    const page = await browser.newPage();

    page.on("response", (response) => {
      console.log(response.status());
    });

    await page.setExtraHTTPHeaders({
      // Quality values use a dot ("q=0.5"); a comma would start a new entry.
      "Accept-Language": "en-US,en;q=0.5",
      // "*/*" is the catch-all media range.
      Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    });

    await page.goto(targetUrl);

    // page.waitFor() is deprecated/removed in current Puppeteer; a plain timer
    // works on every version. The wait happens after navigation because
    // waiting on a blank page beforehand has no effect.
    await new Promise((resolve) => setTimeout(resolve, settleDelayMs));
  } catch (err) {
    console.error(err);
  } finally {
    // Always release the Chromium process, even when navigation failed.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.error(closeErr);
      }
    }
  }
})();