2

I'm trying to scrape prices from multiple pages using Puppeteer. What I'm having trouble with is writing a single JSON file containing all the scraped data. The problem is that if I try to write the file using the variables declared inside the async functions, I get an error saying that the variable hasn't been declared.

/**
 * Scrapes two price texts from the VMZ ticket page.
 *
 * Launches its own browser, opens `url`, reads the `textContent` of two
 * XPath-selected elements, logs them, and returns them so the caller can
 * combine them with other results (e.g. to write a single JSON file).
 *
 * @param {string} url - Address of the page to open.
 * @returns {Promise<{preco: string, preco2: string}>} The two raw texts, in
 *   the order the XPaths are queried.
 * @throws {Error} If either XPath matches no element on the page.
 */
async function scrapeVMZ(url) {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);

        // Reads the textContent of the first node matching `xpath`.
        const readText = async (xpath) => {
            const [element] = await page.$x(xpath);
            if (!element) {
                // Fail with a message that names the selector instead of a
                // TypeError on `undefined.getProperty`.
                throw new Error(`scrapeVMZ: no element matches XPath ${xpath}`);
            }
            const property = await element.getProperty('textContent');
            return property.jsonValue();
        };

        const vmzRawTxt1 = await readText('//*[@id="__layout"]/div/div[1]/section/div/div/div[2]/div[2]/div[1]/div/div/div[2]/div/div[1]/span[2]');
        const vmzRawTxt2 = await readText('//*[@id="__layout"]/div/div[1]/section/div/div/div[2]/div[2]/div[1]/div/div/div[2]/div/div[1]/span[4]/b');

        console.log({vmzRawTxt1, vmzRawTxt2});
        console.log(JSON.stringify(vmzRawTxt1), JSON.stringify(vmzRawTxt2));

        // Return the raw text rather than the JSON-encoded form, so the caller
        // can serialize the combined object once without double-encoding.
        return {preco: vmzRawTxt1, preco2: vmzRawTxt2};
    } finally {
        // Always shut the browser down, even when scraping throws.
        await browser.close();
    }
}
// Start the VMZ scrape; report a failure explicitly instead of leaving the
// returned promise without a rejection handler.
scrapeVMZ('https://www.vmzviagens.com.br/ingressos/orlando/walt-disney-orlando')
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });


/**
 * Scrapes two price texts from the Magic Blue products page.
 *
 * Launches its own browser, opens `url`, reads the `textContent` of two
 * XPath-selected elements, logs them, and returns them so the caller can
 * combine them with other results (e.g. to write a single JSON file).
 *
 * @param {string} url - Address of the page to open.
 * @returns {Promise<{preco: string, preco2: string}>} The two raw texts, in
 *   the order the XPaths are queried.
 * @throws {Error} If either XPath matches no element on the page.
 */
async function scrapeMB(url) {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto(url);

        // Reads the textContent of the first node matching `xpath`.
        const readText = async (xpath) => {
            const [element] = await page.$x(xpath);
            if (!element) {
                // Fail with a message that names the selector instead of a
                // TypeError on `undefined.getProperty`.
                throw new Error(`scrapeMB: no element matches XPath ${xpath}`);
            }
            const property = await element.getProperty('textContent');
            return property.jsonValue();
        };

        const mbRawTxt1 = await readText('/html/body/section[3]/div/div/div[2]/div[1]/div/div[2]/a[1]/span[2]/span/div/div[2]/span');
        const mbRawTxt2 = await readText('/html/body/section[3]/div/div/div[2]/div[1]/div/div[2]/a[1]/span[2]/span/div/div[4]/span');

        console.log({mbRawTxt1, mbRawTxt2});
        console.log(JSON.stringify(mbRawTxt1), JSON.stringify(mbRawTxt2));

        // Return the raw text rather than the JSON-encoded form, so the caller
        // can serialize the combined object once without double-encoding.
        return {preco: mbRawTxt1, preco2: mbRawTxt2};
    } finally {
        // Always shut the browser down, even when scraping throws.
        await browser.close();
    }
}
// Start the Magic Blue scrape; report a failure explicitly instead of leaving
// the returned promise without a rejection handler.
scrapeMB('https://www.ingressosmagicblue.com.br/produtos/?mpage=2')
    .catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });

How can I write a file, using the code above, that stores the variables vmz01, vmz02, mb01 and mb02 in my JSON file, like the example below?

// Desired shape of the JSON output: one entry per site, each holding the two
// values scraped from that site.
let abc = {
    MB: {preco: mb01, preco2: mb02},
    VMZ: {preco: vmz01, preco2: vmz02},
};
ggorlen
  • 44,755
  • 7
  • 76
  • 106
Fadrick
  • 69
  • 8

1 Answer

1

When console.log appears in a function instead of returning results, that's a dead end. Return the results if you want to use them later. Since you're returning promises, you can await them in the caller, either serially or in parallel.

There's also a lot of repeated code in your functions, and you probably don't need 2 browsers. Here's a quick refactor that runs in parallel in a single browser (the preco keys are sort of awkward--I'd suggest an array here potentially).

const fs = require("fs").promises;
const puppeteer = require("puppeteer"); // ^14.3.0

// XPaths queried on the VMZ page. Both share the same container path and
// differ only in the trailing segment.
const vmzPaths = ["span[2]", "span[4]/b"].map(
  tail => `//*[@id="__layout"]/div/div[1]/section/div/div/div[2]/div[2]/div[1]/div/div/div[2]/div/div[1]/${tail}`
);

// XPaths queried on the Magic Blue page. Both share the same container path
// and differ only in the trailing segment.
const mbPaths = ["div[2]/span", "div[4]/span"].map(
  tail => `/html/body/section[3]/div/div/div[2]/div[1]/div/div[2]/a[1]/span[2]/span/div/${tail}`
);

/**
 * Opens `url` in a new page of `browser` and reads the text of each XPath.
 *
 * @param {object} browser - Browser instance used to open the page.
 * @param {string} url - Address to navigate to.
 * @param {string[]} paths - XPath expressions to wait for and read.
 * @returns {Promise<string[]>} The `textContent` of each matched element, in
 *   the same order as `paths`.
 */
const scrape = async (browser, url, paths) => {
  const page = await browser.newPage();
  await page.goto(url);

  // Waits for the element at `xpath` to appear, then pulls out its text.
  const readText = async xpath => {
    const handle = await page.waitForXPath(xpath);
    return handle.evaluate(node => node.textContent);
  };

  // All lookups are started together; results keep the order of `paths`.
  return Promise.all(paths.map(xpath => readText(xpath)));
};

// Kept outside the async function so the final cleanup step can reach it
// even when the scrape fails part-way through.
let browser;

// Launches one headless browser, scrapes both sites in parallel, and writes
// the collected texts to out.json keyed by site name, then by
// "preco", "preco2", ...
(async () => {
  browser = await puppeteer.launch({headless: true});

  const sites = [
    ["MB", "https://www.ingressosmagicblue.com.br/produtos/?mpage=2", mbPaths],
    ["VMZ", "https://www.vmzviagens.com.br/ingressos/orlando/walt-disney-orlando", vmzPaths],
  ];
  const results = await Promise.all(
    sites.map(([, url, paths]) => scrape(browser, url, paths))
  );

  const collected = {};
  sites.forEach(([siteName], siteIndex) => {
    const prices = {};
    results[siteIndex].forEach((value, priceIndex) => {
      // The first key has no numeric suffix; later ones are 1-based.
      const key = priceIndex === 0 ? "preco" : `preco${priceIndex + 1}`;
      prices[key] = value;
    });
    collected[siteName] = prices;
  });

  await fs.writeFile("out.json", JSON.stringify(collected, null, 2));
})()
  .catch(error => console.error(error))
  .finally(() => browser?.close());

As an aside, I'm not a big fan of hyper-precise, browser generated paths and selectors. These tend to be super brittle, and there's almost always a better way to choose a selector. But I haven't looked at the page in the interests of focusing on the promises issue, so I'll leave that as an exercise for the reader.

ggorlen
  • 44,755
  • 7
  • 76
  • 106
  • I'm still figuring out how to work with puppeteer (actually, with javascript as well), so thanks for not only helping with my request, but also for pointing out what should be improved! I'll try to learn a better way to choose a selector from those pages! – Fadrick Jun 29 '22 at 14:35