I'm trying to scrape a page of card items. I'd like to extract the titles, prices, image sources and other properties from these cards. However, when I scrape with Puppeteer and Cheerio, some of the data is missing. See the image below:
How can I make sure all of the data comes through?
This is my code:
// Scrape Wallapop search results: launch a stealth Chromium instance, load the
// search page, snapshot the rendered DOM, and map each item card to a plain
// object ({id, title, link, image, price, empresa}).
// NOTE(review): `puppeteer2`, `USER_AGENT`, `cheerio`, `uuid`, and `items` are
// not defined in this excerpt — presumably declared earlier in the file; the
// closing of the try/IIFE is also outside this view.
(async () => {
try {
// Stealth plugin reduces headless-browser fingerprinting/bot detection.
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer2.use(StealthPlugin());
const browser = await puppeteer2.launch({
executablePath: "/usr/bin/chromium-browser",
headless: true,
args: [
"--no-sandbox",
"--disable-setuid-sandbox",
// Spoof the user agent; USER_AGENT is defined elsewhere in the file.
"--user-agent=" + USER_AGENT + "",
],
});
// NOTE(review): Puppeteer's browser.newPage() takes no options object —
// ignoreHTTPSErrors belongs in launch(); confirm and move if intended.
const page = await browser.newPage({ignoreHTTPSErrors: true});
// 0 disables the navigation timeout entirely — goto() can hang forever.
await page.setDefaultNavigationTimeout(0);
await page.goto("https://es.wallapop.com/search?keywords=", {
waitUntil: "networkidle0",
});
// Fixed 30 s wait, presumably to let client-side rendering finish.
// NOTE(review): if cards below the fold are lazy-loaded on scroll, a plain
// wait never triggers them — that would explain the missing data; an
// incremental scroll loop before snapshotting would be needed. Verify.
await page.waitForTimeout(30000);
// Snapshot the rendered DOM so Cheerio can parse it outside the browser.
const body = await page.evaluate(() => {
return document.querySelector("body").innerHTML;
});
var $ = cheerio.load(body);
// Build one plain object per item card found in the snapshot.
const pageItems = $(".ItemCardList__item .ng-star-inserted")
.toArray()
.map((item) => {
const $item = $(item);
return {
// id: $item.attr('data-adid'), c10420p([^i]*)\/
// Synthetic id — the site's own ad id is not extracted (see line above).
id: uuid.v4(),
title: $item.find(".ItemCard__info").text(),
// NOTE(review): constant base URL — the per-item id/slug is never
// appended, so every record gets the same link. Likely unintended.
link: "https://es.wallapop.com/item/",
image: $item.find(".w-100").attr("src"),
price: $item
.find(".ItemCard__price")
.text()
// Strip underscores and every non-word character, keeping only
// letters/digits (currency symbol and separators are removed).
.replace(/[_\W]+/g, ""),
empresa: "wallapop",
};
});
// NOTE(review): `items` is not defined in this excerpt — presumably an
// accumulator from an enclosing loop; verify it is in scope.
const allItems = items.concat(pageItems);
console.log(
pageItems.length,
"items retrieved",
allItems.length,
"acumulat ed",
);
// ...