0

I'm writing an API with express, puppeteer-cluster and cheerio that returns all anchor elements containing one or more words that can be added as query parameters. I want to use puppeteer in order to get elements that are javascript generated too. But for some reason it's not working, I get an empty array as an output printed on the browser.

I'm still trying to understand this library, but it has been 2 days and I have made no progress. Any help is deeply appreciated.

Update: I added async to all my functions and they run now, but the result is still empty :(

Update 2: I started logging everything, every step and found that data.name is being passed to the cheerio function as a Promise. '-' I think that is the problem, but don't know how to fix it yet.

Update 3: One of the issues was that the page content (HTML code) was not being passed properly to the Cheerio function. In the browser, however, the response is empty and the console shows an error:

Error handling response: TypeError: Cannot read properties of undefined (reading 'innerText').

So, I think the response is not json formatted. Is res.json() not the right way to do it?

My code:

app.js

// Application entry point: configures middleware, mounts the routers and
// starts the HTTP server.
const PORT = process.env.PORT || 8000;
const path = require("path");
const express = require("express");
// `cors()` is mounted below, so the middleware must be loaded here; without
// this line the script throws a ReferenceError before the server starts.
const cors = require("cors");

// Routes
const indexRouter = require("./routes/index");
const allNews = require("./routes/news");
const clusterRouter = require("./routes/cluster");

const app = express();
app.use(cors());
app.use(express.json());
app.use(express.urlencoded({ extended: false }));
// Serve files from ./public relative to this file.
app.use(express.static(path.join(__dirname, "public")));

app.use("/", indexRouter);
app.use("/news", allNews);
app.use("/cluster", clusterRouter);

app.listen(PORT, () => console.log(`server running on PORT ${PORT}`));

cluster.js

const express = require("express");
const { Cluster } = require("puppeteer-cluster");
const puppeteer = require("puppeteer-extra");
const cheerio = require("cheerio");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

// Router exported by this module.
const router = express.Router();

// Sites to scrape. Each entry carries the label stored on results (`name`),
// the page that is loaded (`address`) and the origin that is prepended to
// relative links found on that page (`base`).
const newspapers = [
  {
    name: "CNN",
    address: "https://edition.cnn.com/specials/world/cnn-climate",
    base: "https://edition.cnn.com",
  },
  {
    name: "The Guardian",
    address: "https://www.theguardian.com/environment/climate-crisis",
    base: "https://www.theguardian.com",
  },
];

// NOTE(review): `app` is not referenced anywhere in the visible code.
const app = express();

// Register the stealth plugin on the puppeteer-extra instance.
puppeteer.use(StealthPlugin());

// Accumulator the route handler pushes into and sends as the JSON response.
// NOTE(review): it lives at module level and is never cleared in the visible
// code, so entries pushed during one request are still present on the next.
const result = [];

router.get("/", async (req, res) => {
  (async () => {
    // Query String
    const query = checkForQuery(req);
    const wordsToSearch = query ? verifyQuery(query) : "";

    console.log("Running tests.."); // This is printed on console
    
    //Functions
    /**
     * Returns the parsed query object when the request URL contains a "?",
     * logging it on the way; returns `false` otherwise.
     *
     * @param {object} request - object exposing `originalUrl` and `query`.
     * @returns {object|false} `request.query`, or `false` when there is no "?".
     */
    function checkForQuery(request) {
      const hasQueryString = request.originalUrl.includes("?");
      if (!hasQueryString) {
        return false;
      }
      console.log(request.query);
      return request.query;
    }

    /**
     * Validates the query and keeps only the terms that appear in `topics`.
     *
     * Only the `only` and `also` parameters are read. Each may be a
     * comma-separated string or, when the parameter is repeated in the URL
     * (`?only=a&only=b`), an array of such strings. Anything else is ignored.
     * Terms are trimmed and empty terms are dropped before the lookup.
     *
     * NOTE(review): `topics` is not defined in the visible code; it is assumed
     * to be an array of allowed terms available in the enclosing scope.
     *
     * @param {object} queryString - parsed query parameters.
     * @returns {{only: string[], also: string[]}} the valid terms per key.
     */
    function verifyQuery(queryString) {
      // Flattens one raw parameter value into a list of candidate terms.
      const toTerms = (raw) => {
        if (Array.isArray(raw)) {
          // Repeated parameters arrive as arrays; calling `.split` on them
          // directly would throw a TypeError.
          return raw.flatMap(toTerms);
        }
        if (typeof raw !== "string") {
          return [];
        }
        return raw
          .split(",")
          .map((term) => term.trim())
          .filter((term) => term !== "");
      };

      const keepKnownTopics = (raw) =>
        toTerms(raw).filter((term) => topics.includes(term));

      // Creates new list containing valid terms for search
      const newList = {
        only: keepKnownTopics(queryString.only),
        also: keepKnownTopics(queryString.also),
      };

      console.log(newList);
      return newList;
    }

    /**
     * Builds the result entry for a single anchor element.
     *
     * Any `<style>` descendants are removed from the element first so their
     * contents do not end up in the title text.
     *
     * NOTE(review): `tempUrls`, `exceptions` and `getImageFromElement` are not
     * defined in the visible code; they are assumed to come from the enclosing
     * scope (list of URLs already seen, list of lowercase substrings to skip,
     * and a helper that extracts an image URL).
     *
     * @param {object} element - Cheerio-wrapped anchor element.
     * @param {string} base - origin prepended to relative hrefs.
     * @param {string} name - source label stored on the entry.
     * @returns {object[]} an array holding one `{ title, url, source }` entry
     *   (plus `imgUrl` when the anchor contains an image), or an empty array
     *   when the anchor has no href, its URL was already seen, or the URL
     *   matches one of the exceptions.
     */
    function storeData(element, base, name) {
      element.find("style").remove();

      const urlRaw = element.attr("href");
      // Anchors without an href yield `undefined` here; calling `.includes`
      // on that would throw, so such anchors are skipped.
      if (typeof urlRaw !== "string") {
        return [];
      }

      const looksAbsolute = urlRaw.includes("www") || urlRaw.includes("http");
      const url = looksAbsolute ? urlRaw : base + urlRaw;

      // Check for duplicated url
      if (tempUrls.includes(url)) {
        return [];
      }

      // Check for social media links and skip
      const loweredUrl = url.toLowerCase();
      if (exceptions.some((el) => loweredUrl.includes(el))) {
        return [];
      }

      tempUrls.push(url);

      const entry = {
        title: element.text().replace(/(\r\n|\n|\r)/gm, ""),
        url,
        source: name,
      };

      // Get img if child of anchor tag
      const imageElement = element.find("img");
      if (imageElement.length > 0) {
        entry.imgUrl = getImageFromElement(imageElement);
      }

      return [entry];
    }

    /**
     * Parses an HTML string with Cheerio and collects the entries that
     * `storeData` produces for every matching anchor.
     *
     * With search terms, anchors are matched once per `also` term (anchor has
     * a descendant containing both "climate" and the term) and then once per
     * `only` term (anchor contains the term). Without search terms, anchors
     * containing "climate" are matched.
     *
     * @param {string} html - markup handed to `cheerio.load`.
     * @param {string} base - forwarded to `storeData`.
     * @param {string} name - forwarded to `storeData`.
     * @param {{also: string[], only: string[]}|""} searchterms - validated
     *   terms, or a falsy value for the default search.
     * @returns {object[]} every entry returned by `storeData`, in match order.
     */
    function getElementsCheerio(html, base, name, searchterms) {
      console.log(html, base, name);
      const $ = cheerio.load(html);
      console.log(searchterms);

      const concatInfo = [];

      // Runs one selector and appends whatever storeData yields per match.
      const collect = (selector) => {
        $(selector).each(function () {
          concatInfo.push(...storeData($(this), base, name));
        });
      };

      if (!searchterms) {
        collect('a:contains("climate")');
        return concatInfo;
      }

      const { also, only } = searchterms;

      for (const term of also) {
        collect(`a:has(:contains("climate"):contains(${term}))`);
      }

      for (const term of only) {
        collect(`a:contains(${term})`);
      }

      return concatInfo;
    }
    
    const cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_CONTEXT,
      maxConcurrency: 2,

      puppeteerOptions: {
        headless: true,
        args: ["--no-sandbox", "--disable-setuid-sandbox"],
        userDataDir: "./tmp",
        defaultViewport: false,
      },
    });

    await cluster.task(async ({ page, data }) => {
      await page.goto(data.address);
      await page.waitForSelector("body");
      
      // console.log here prints that data.name is a Promise :(
      const elements = await getElementsCheerio(
        document.body.innerHTML,
        data.base, 
        data.name,
        wordsToSearch
      );
      result.push(elements);
    });

    newspapers.map((newspaper) => {
      console.log("queue" + newspaper); // This logs correctly: queue[object Object]
      cluster.queue(newspaper);
    });

    await cluster.idle();
    await cluster.close();

    // Display final object 
    res.json(result);
  })();
});

module.exports = router;

I don't get any errors, but on screen I get an empty [ ]. Can anyone see what I am doing wrong here? :(

ggorlen
  • 44,755
  • 7
  • 76
  • 106
Bella
  • 414
  • 2
  • 13
  • The issue is probably [How do I return the response from an asynchronous call?](https://stackoverflow.com/questions/14220321/how-do-i-return-the-response-from-an-asynchronous-call). What's `getElementsCheerio` do? See [mcve]. Why use Cheerio when Puppeteer can [already select on the live page](https://serpapi.com/blog/puppeteer-antipatterns/#using-a-separate-html-parser-with-puppeteer)? – ggorlen Dec 19 '22 at 18:57
  • Saw all that, still don't see what I did wrong. I'm using cheerio, first because the whole function is ready, I was using with axios before, but I want to get Js content too. And second, I want to get anchor elements that contain one or more specific words, its children and parents, and from what I read, it's easier with cheerio, as it is :) My cheerio function iterates through the query parameters and collect the proper elements. – Bella Dec 19 '22 at 20:02
  • Why is the cheerio func async though? Cheerio is 100% sync. I still don't see a [mcve]. I don't see any problems in the current code. The task queue seemed "off" initially with `result.push()` but looks OK offhand on closer inspection, so a complete, runnable example would be necessary to help. – ggorlen Dec 19 '22 at 20:18
  • I added the whole code now. I don't understand either why the other functions have to be async, but that's the only way I could make it work – Bella Dec 19 '22 at 20:36
  • But it's not working, right? If you're not using `await` in a func, there's no need to make it `async`. Thanks for the code, but it's still not minimal or runnable because I don't have your newspapers JSON. Can you strip out the unnecessary code and boil it down to something simple I can run and reproduce? If you `console.log(concatInfo)` before `return concatInfo;` does it show anything? Maybe the site(s) are blocking you and the HTML isn't what you assume it is. You can verify that without Express or Puppeteer-cluster, just a few lines of Puppeteer code. – ggorlen Dec 19 '22 at 20:37
  • You're right :) Removed the async and get to the same place. In ```await cluster.task(async ({ page, data })```, data.name comes as a Promise... that's probably why the function doesn't run properly – Bella Dec 19 '22 at 20:45
  • omg, so sorry!! I added 2 elements from my newspaper array – Bella Dec 19 '22 at 20:47
  • Again, these functions worked perfectly with axios, I don't think they are the problem :( at least something should come out of them. By the way, thank you sooooo much for the help S2 – Bella Dec 19 '22 at 20:49
  • It's still a bit of a pain to set up and run, a lot of code and files to put together, but looking at it again, what's `document.body.innerHTML`? Usually with Puppeteer you'd need to use `await page.content()` to get a Cheerio-compatible string, no? Is `puppeteer-cluster` smart enough to provide `document` in the callback? Anyway, if that's the problem then it just affirms my antipattern from the first post--the extra layer of indirection between Puppeteer and a third-party parser winds up causing more trouble than its worth, but if you're reusing pre-existing Cheerio code, I understand. – ggorlen Dec 19 '22 at 21:01
  • You're a genius. :D Fixed that :) The browser shows an error on console tho. Seems like the response is not json? I updated in the question – Bella Dec 19 '22 at 21:27
  • 1
    I managed. It's workiiiiiiing :D Thank you so much! Do you want to answer with the page.content() issue so I can accept? That change made the magic possible :) – Bella Dec 19 '22 at 21:50

1 Answer

1

In general, it's an antipattern to mix Puppeteer with another selection library like Cheerio. In addition to being redundant, the extra HTML parser doesn't work on the live document as Puppeteer does, so you have to snapshot the HTML at a particular moment with Puppeteer to capture it as a string and plug that string into Cheerio, where it's re-parsed back to a traversable tree structure.

Introducing this extra step creates opportunity for bugs and confusion to creep in, and that's what happened here.

The code

const elements = await getElementsCheerio(
    document.body.innerHTML,
    data.base, 
    data.name,
    wordsToSearch
);

is problematic. document.body.innerHTML doesn't refer to anything related to Puppeteer. Instead, use Puppeteer's await page.content() to snapshot the HTML.

As a minor point, there's no need for Cheerio functions to be async, because they never use await. It's a fully synchronous API.

Here's a minimal setup for using Cheerio with Puppeteer, assuming you accept the terms and conditions and are sure that introducing this usually unnecessary layer of indirection is appropriate for your use case:

const cheerio = require("cheerio"); // 1.0.0-rc.12
const puppeteer = require("puppeteer"); // ^19.0.0

// Kept outside the function so the cleanup step below can reach the browser
// even when launching or scraping fails part-way through.
let browser;

/**
 * Launches a browser, loads the example page, snapshots its HTML with
 * Puppeteer and prints the text of its <h1> using Cheerio.
 */
async function printHeading() {
  const url = "https://www.example.com";

  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});

  // Snapshot the page as a string, then hand that string to Cheerio.
  const html = await page.content();
  const $ = cheerio.load(html);

  // do cheerio stuff synchronously
  console.log($("h1").text()); // => Example Domain
}

printHeading()
  .catch((err) => console.error(err))
  .finally(() => browser?.close());

It's basically the same for puppeteer-cluster: just drop the lines starting with const html = await page.content(); into the cluster.task callback that operates on page.

ggorlen
  • 44,755
  • 7
  • 76
  • 106