-1

I am using the following code with Puppeteer to check if the relevant section has loaded or not, but it always returns a loading error. What could be the reason for this?

I don't understand which part is missing.

I am using the below code:

/**
 * Visits `lnk` once per time frame (1, 5 and 15, sent as `timeFrame` = value * 60)
 * and collects the text of `section.forecast-box-graph .title` from each page.
 *
 * @param {string} lnk - Base page URL; `?timeFrame=<n>` is appended to it.
 * @returns {Promise<string[]>} The collected status strings, followed by one
 *   final entry derived from the last path segment of `lnk` with dashes
 *   replaced by spaces.
 */
async function getResults(lnk) {
  const results = [];
  const timeFrames = [1, 5, 15];

  const browser = await puppeteer.launch({
    headless: true,

    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--single-process",
      "--no-zygote",
      // "--remote-debugging-port"
    ],
    timeout: 0,

    // In production the browser binary comes from the environment variable;
    // otherwise Puppeteer's own executable path is used.
    executablePath:
      process.env.NODE_ENV === "production"
        ? process.env.PUPPETEER_EXECUTABLE_PATH
        : puppeteer.executablePath(),
  });

  // A single page object is created here and shared by every loop iteration.
  const page = await browser.newPage();

  for (const i of timeFrames) {
    const url = `${lnk}?timeFrame=${i * 60}`;
    console.log(`Getting: ${url}`);

    await page.goto(url, {waitUntil: "networkidle0"});

    console.log("Process 1");
    // await page.waitForTimeout(10000); // wait for 10 seconds
    // await page.waitForNavigation({waitUntil: "domcontentloaded"});
    // await page.waitForTimeout(1000)

    // NOTE(review): `await` is combined with a `.then()/.catch()` chain here.
    await page
      .waitForSelector("section.forecast-box-graph")
      .then(async () => {
        // await page.waitForSelector('h1.main-title.js-main-title');
        console.log("Getting");

        const status = await page.$eval(
          "section.forecast-box-graph .title",
          el => el.textContent
        );
        // NOTE(review): Bank_Name is read but never added to `results`.
        const Bank_Name = await page.$eval(
          "h1.main-title.js-main-title",
          el => el.textContent.trim()
        );
        results.push(status);
        // NOTE(review): this closes the shared page inside the loop, yet the
        // next iteration still calls page.goto() on the same `page`.
        await page.close();
      })
      // NOTE(review): the rejection reason is discarded, so the log line does
      // not say what actually failed (selector timeout, $eval miss, ...).
      .catch(() => console.log("Loading error"));
  }
  await browser.close();

  // Last entry: final path segment of the link with "-" turned into spaces.
  results.push(lnk.split("/").pop().split("-").join(" "));

  return results;
}

This is the relevant part of the page, and the link is https://in.investing.com/equities/axis-bank-technical?timeFrame=60 (screenshot: enter image description here)

ggorlen
  • 44,755
  • 7
  • 76
  • 106
hobik
  • 439
  • 4
  • 15
  • What error does it throw specifically, and what are you trying to achieve here? Instead of printing `"loading error"`, I would print the actual error so you can understand what the problem might be. I suggest [not combining `await` and `then`](https://stackoverflow.com/a/75785234/6243352) – ggorlen May 09 '23 at 20:17
  • The error I get is ```TimeoutError: Waiting for selector `section.forecast-box-graph` failed: Waiting failed: 30000ms exceeded``` which offers a good deal of actionability. The selector you want isn't on the page. Your screenshot appears to be a different page, or a mobile view of the same page. Do you need to click on something for that view to appear? – ggorlen May 09 '23 at 20:25
  • Worth a look: [Crawling multiple URLs in a loop using Puppeteer](https://stackoverflow.com/questions/46293216/crawling-multiple-urls-in-a-loop-using-puppeteer) – ggorlen May 09 '23 at 23:40

1 Answers1

3

You're closing the page in the wrong place. Moving everything into the for loop, as shown below, and increasing the viewport size should solve your problems.

const puppeteer = require("puppeteer");

(async () => {

/**
 * Scrapes the technical-summary status of one instrument for the 1, 5 and 15
 * time frames (sent to the site as `timeFrame` = value * 60).
 *
 * A separate browser is launched for every time frame and is always closed
 * again, including when navigation or a selector lookup throws.
 *
 * @param {string} lnk - Base page URL; `?timeFrame=<n>` is appended to it.
 * @returns {Promise<Array<{bankName: string, status: string, lnk: string, url: string}>>}
 *   One entry per time frame.
 * @throws Whatever Puppeteer throws (e.g. a timeout while waiting for the
 *   selector); the error propagates after the browser has been closed.
 */
async function getResults(lnk) {
    const results = [];
    const timeFrames = [1, 5, 15];

    for (const i of timeFrames) {
        const browser = await puppeteer.launch({headless: true});

        // try/finally guarantees the browser process is released even when a
        // step below rejects; previously an error left it running because the
        // outer cleanup referenced a variable that was never assigned.
        try {
            const page = await browser.newPage();
            await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36");
            // The wanted section is only looked for after enlarging the viewport.
            await page.setViewport({width: 1920, height: 1080});

            const url = `${lnk}?timeFrame=${i * 60}`;
            console.log(url);

            await page.goto(url, {waitUntil: "networkidle2", timeout: 70000});

            await page.waitForSelector('section.forecast-box-graph');

            const status = await page.$eval("section.forecast-box-graph .title", el => el.textContent);
            const bank_name = await page.$eval("h1.main-title.js-main-title", (el) => el.textContent.trim());

            results.push({
                bankName: bank_name,
                status: status,
                lnk: lnk.replace(/-/g, ' ').split('/').pop(),
                url: url
            });
        } finally {
            await browser.close();
        }
    }

    return results;

}

console.log( await getResults('https://in.investing.com/equities/axis-bank-technical'));


})().catch(err => console.error(err));

How to run on Render (source)

package.json - add the following dependencies

"dotenv": "^16.0.3",
"express": "^4.18.2",
"puppeteer": "^20.1.2"

index.js

const express = require("express");
const { scrape } = require('./scrape');
const app = express();

const PORT = process.env.PORT || 3000;

/**
 * Returns true only for a single string value that parses as an http: or
 * https: URL. Anything else (missing, repeated query parameter, file:, etc.)
 * is rejected before it is handed to the headless browser.
 */
const isHttpUrl = (value) => {
    if (typeof value !== "string") {
        return false;
    }
    try {
        const { protocol } = new URL(value);
        return protocol === "http:" || protocol === "https:";
    } catch {
        return false;
    }
};

/**
 * GET /scrape?url=<page URL>
 * Delegates to scrape(), which writes the response itself on success.
 */
app.get("/scrape", (req, res) => {
    const target = req.query.url;

    // The URL comes straight from the client and is opened by a browser on
    // the server, so refuse anything that is not a plain http(s) URL.
    // NOTE(review): consider an allow-list of hosts as well.
    if (!isHttpUrl(target)) {
        res.status(400).send({ error: "Query parameter 'url' must be an http(s) URL" });
        return;
    }

    // scrape() is async: without a rejection handler a failure (for example
    // the browser failing to launch) would be an unhandled rejection and the
    // request would never get a response.
    scrape(target, res).catch((err) => {
        console.error(err);
        if (!res.headersSent) {
            res.status(500).send({ error: err instanceof Error ? err.message : String(err) });
        }
    });
});

// Simple liveness check.
app.get("/", (req, res) => {
    res.send("test is running");
});

app.listen(PORT, () => {
    console.log(`Listening on port ${PORT}`);
});

Dockerfile

# Base image: the Puppeteer image, pinned to 20.1.2 (package.json asks for puppeteer ^20.1.2).
FROM ghcr.io/puppeteer/puppeteer:20.1.2

# PUPPETEER_EXECUTABLE_PATH is what scrape.js passes as executablePath when
# NODE_ENV is "production".
# NOTE(review): the skip-download flag presumably stops npm from fetching a
# second browser, and the Chrome binary is assumed to exist at this path in
# the base image - confirm against the image documentation.
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
    PUPPETEER_EXECUTABLE_PATH=/usr/bin/google-chrome-stable

WORKDIR /usr/src/app

# Copy only the package manifests and install dependencies before the rest of
# the sources are copied in.
COPY package*.json ./
RUN npm ci
COPY . .
# Start the Express server defined in index.js.
CMD [ "node", "index.js" ]

scrape.js - modify the above code a bit

const puppeteer = require('puppeteer');
require("dotenv").config();

// Resource types answered with a stub instead of being fetched.
const SKIPPED_RESOURCE_TYPES = /image|imageset|media|stylesheet|font|script/;

/**
 * Scrapes the technical-summary status of one instrument for the 1, 5 and 15
 * time frames (sent to the site as `timeFrame` = value * 60) and sends the
 * collected entries with `res.send`.
 *
 * One browser is shared by all time frames; each time frame gets its own page.
 * A failure inside one time frame is reported as an entry with an `error`
 * message instead of aborting the others. The browser is always closed, also
 * when something outside the per-page handling throws.
 *
 * @param {string} lnk - Base page URL; `?timeFrame=<n>` is appended to it.
 * @param {{send: Function}} res - Express-style response used to send the results.
 * @returns {Promise<void>} Resolves after the response was sent and the
 *   browser closed; rejects if launching the browser or opening a page fails.
 */
const scrape = async (lnk,res) => {

    const browser = await puppeteer.launch({
        headless: true, // "new" gives error on render
        args: [
            "--no-sandbox",
            "--disable-setuid-sandbox",
            "--disable-dev-shm-usage",
            "--single-process",
            "--no-zygote",
        ],
        executablePath:
            process.env.NODE_ENV === 'production'
                ? process.env.PUPPETEER_EXECUTABLE_PATH
                : puppeteer.executablePath(),
    });

    try {
        const results = [];
        const timeFrames = [1, 5, 15];
        // Last path segment of the link with dashes replaced by spaces.
        const label = lnk.replace(/-/g, ' ').split('/').pop();

        for (const i of timeFrames) {
            const page = await browser.newPage();
            const url = `${lnk}?timeFrame=${i * 60}`;
            console.log(url);
            try {
                await page.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36");
                await page.setViewport({width: 1920, height: 1080});

                //skip loading non-essentials
                await page.setRequestInterception(true);
                page.on('request', (req) => {
                    // Another handler already resolved this request; touching
                    // it again would throw.
                    if (req.isInterceptResolutionHandled()) {
                        return;
                    }
                    const outcome = SKIPPED_RESOURCE_TYPES.test(req.resourceType())
                        ? req.respond({status: 200, body: 'aborted'})
                        : req.continue();
                    // The emitter ignores the returned promise, so report a
                    // failure here rather than leaving an unhandled rejection.
                    outcome.catch((err) => console.warn(`Request interception failed for ${req.url()}: ${err.message}`));
                });

                await page.goto(url, {waitUntil: "load", timeout: 7000});

                await page.waitForSelector('section.forecast-box-graph');

                const status = await page.$eval("section.forecast-box-graph .title", el => el.textContent);
                const bank_name = await page.$eval("h1.main-title.js-main-title", (el) => el.textContent.trim());

                results.push({
                    bankName: bank_name,
                    status: status,
                    lnk: label,
                    url: url
                });
            } catch (err) {
                results.push({
                    url : url,
                    lnk: label,
                    // An Error object loses its message when serialised to
                    // JSON, so send the message text instead.
                    error : err instanceof Error ? err.message : String(err)
                });
            } finally {
                await page.close();
            }
        }

        res.send(results);
    } finally {
        await browser.close();
    }

};

module.exports = {scrape};

.gitignore

/node_modules
  • push everything to a new github repository,
  • open Render, create NEW webservice, connect or add your git repository
  • Name: anything, Runtime : should be Docker,
  • click Advanced, add environment variable
    • key: PUPPETEER_SKIP_CHROMIUM_DOWNLOAD, value: true
    • key: PUPPETEER_EXECUTABLE_PATH, value: /usr/bin/google-chrome-stable
  • click Create Web Service
  • wait until setup is complete, and go to the <URL> it gives you, you'll see the 'test is running' message.
  • next go to <URL>/scrape?url=https://in.investing.com/equities/axis-bank-technical and you'll get the result

Note(s) :

  • waitUntil can be changed from "load" to "networkidle0" or "networkidle2", they all work.
  • Changing headless: true to headless: "new", as the deprecation warning suggests, gives errors on Render, even though it works fine locally.
idchi
  • 761
  • 1
  • 5
  • 15
  • Hello thank you for your comment. I use this code on my Server in Render, I have this error: Ran out of memory (used over 512MB) while running your code. – hobik May 09 '23 at 21:13
  • and also timeout error sr/src/app/node_modules/puppeteer-core/lib/cjs/puppeteer/common/LifecycleWatcher.js:162 May 10 12:11:07 AM return new Errors_js_1.TimeoutError(errorMessage); May 10 12:11:07 AM ^ May 10 12:11:07 AM May 10 12:11:07 AM TimeoutError: Navigation timeout of 80000 ms exceeded May 10 12:11:07 AM at LifecycleWatcher._LifecycleWatcher_createTimeoutPromise (/usr/src/app/node_modules/puppeteer-core/lib/cjs/puppeteer/common/LifecycleWatcher.js:162:12) – hobik May 09 '23 at 21:14
  • @hobik try being more specific where you run your code next time, added relative parts above. – idchi May 10 '23 at 03:43