1

enter image description here

As a follow-up to my earlier question, "Unable to extract footer Fields with cheerio", I'm trying to scrape the header and footer of a table with Node and Puppeteer. I'm eventually going to turn it into a serverless function. For now, my code is:

const puppeteer = require("puppeteer-core");
const chromium = require("@sparticuz/chromium");

exports.handler = async function (event, context) {

  // const body = JSON.parse(event.body); // postencoded
  // const apn = body.apn;
  console.log('starting ', apn);

  const browser = await puppeteer.launch({
    args: chromium.args,
    defaultViewport: chromium.defaultViewport,
    executablePath: await chromium.executablePath(),
    headless: chromium.headless,
    ignoreHTTPSErrors: true,
  });

   const page = await browser.newPage();
  const url = "https://www.to.pima.gov/propertyInquiry/?stateCodeB=129&stateCodeM=05&stateCodeP=0070";
  await page.goto(url, { waitUntil: 'networkidle0' });
  await delay(2000);

  const data = await page.evaluate(() => {
    let headTHS = Array.from(document.querySelectorAll("#tblAcctBal > thead > tr > th"));
    let footTHS = Array.from(document.querySelectorAll("#tblAcctBal > tfoot > tr > th"));
    return { "headTHS": headTHS, "footTHS": footTHS }
  });
  
  let myArr = [];

  console.log('---------');
  console.log(data);
  // console.log(data.headTHS);
  // console.log(data.footTHS);
  console.log('---------');
  for (let i = 0; i < data.headTHS.length; i++) {
    try {
      let headTh = data.headTHS[i].textContent;
      // console.log(headTh);
      let footTh = data.footTHS[i].textContent;
      // console.log(footTh);
      console.log('---------');
      myArr.push({ headTH: footTH });

    } catch (error) {
      // console.log(error);
      myArr = error;
    }
  }

  // const pageTitle = await page.title();
  await browser.close();
  // console.log(pageTitle);
  return {
    statusCode: 200,
    body: JSON.stringify({
      message: myArr
    })
  }

};

output:

    {
    headTHS: [
        { jQuery111105698387845515578: 79 },
        { jQuery111105698387845515578: 80 },
        { jQuery111105698387845515578: 81 },
        { jQuery111105698387845515578: 82 },
        { jQuery111105698387845515578: 83 },
        { jQuery111105698387845515578: 84 },
        { jQuery111105698387845515578: 85 },
        { jQuery111105698387845515578: 86 },
        { jQuery111105698387845515578: 87 },
        { jQuery111105698387845515578: 88 }
    ],
    footTHS: [
        {}, {}, {}, {}, {},
        {}, {}, {}, {}, {}
    ]
    }

[

I've tried the selectors in the DevTools console and they appear correct. How do I access the text in the header and footer `th` tags?

user1592380
  • 34,265
  • 92
  • 284
  • 515
  • Try `[...document.querySelectorAll("#tblAcctBal > tfoot > tr > th")].map(e => e.textContent)`. You [can't return DOM nodes from the browser to Puppeteer because they're not serializable](https://stackoverflow.com/questions/46377955/puppeteer-page-evaluate-queryselectorall-return-empty-objects), so the normal approach is to pull the text content out browser-side rather than Node-side. Try to [avoid using a delay](https://stackoverflow.com/questions/46919013/puppeteer-wait-n-seconds-before-continuing-to-the-next-line/73676564#73676564). – ggorlen Mar 01 '23 at 23:54

0 Answers0