1

Here is my scraper

const puppeteer = require("puppeteer");

async function openPage() {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();

  await page.setViewport({ width: 1000, height: 926 });
  await page.goto("https://www.livescore.com/en/");

  return page;
}

async function scrapeData(page) {
  let content = [];

  // Get competition data
  await page.waitForSelector(".Gd");
  const compElements = await page.$$(".Gd");

  // Loop through competition elements
  for (let i = 0; i < compElements.length; i++) {
    let compName = await compElements[i].$(".Hd");
    console.log("looping trough comp elements")

    if (compName) {
      const typeCompText = await compName.evaluate(node => node.textContent);
      const currentComp = typeCompText; // Assign the competition here
      
      let matchElements = await page.$$(".Ip");

      // Looping through match elements
      for (let j = 0; j < matchElements.length; j++) {
        let homeTeamElement = await matchElements[j].$(".vp");
        let awayTeamElement = await matchElements[j].$(".wp");
        let homeScoreElement = await matchElements[j].$(".Cp");
        let awayScoreElement = await matchElements[j].$(".Dp");
        let matchTime = await matchElements[j].$(".qs", ".vs");

        if (homeTeamElement && awayTeamElement && homeScoreElement && awayScoreElement) {
          const homeTeamText = await homeTeamElement.evaluate(node => node.textContent);
          const awayTeamText = await awayTeamElement.evaluate(node => node.textContent);
          const homeScoreText = await homeScoreElement.evaluate(node => node.textContent);
          const awayScoreText = await awayScoreElement.evaluate(node => node.textContent);
          const matchTimeText = await matchTime.evaluate(node => node.textContent);

          content.push({
            matchTime: matchTimeText,
            homeTeam: homeTeamText,
            awayTeam: awayTeamText,
            homeScore: homeScoreText,
            awayScore: awayScoreText,
            competition: currentComp,
          });
        };
      };
    };
  };

  return content;
};

(async () => {
  const page = await openPage();
  const dataScraped = await scrapeData(page);
  console.log(dataScraped);

  await page.browser().close();
})();

And here is the output when I run it

  • looping trough comp elements looping trough comp elements looping trough comp elements looping trough comp elements looping trough comp elements
  • [ { matchTime: 'FT', homeTeam: 'Crystal Palace', awayTeam: 'Arsenal', homeScore: '0', awayScore: '1', competition: 'Premier League' },

{ matchTime: 'FT', homeTeam: 'Deportivo Alaves', awayTeam: 'Sevilla', homeScore: '4', awayScore: '3', competition: 'Premier League' },

  • { matchTime: "90+4'", homeTeam: 'Granada', awayTeam: 'Rayo Vallecano', homeScore: '0', awayScore: '2', competition: 'Premier League' },

  • { matchTime: 'FT', homeTeam: 'Torino', awayTeam: 'Cagliari', homeScore: '0', awayScore: '0', competition: 'Premier League' },

  • { matchTime: 'FT', homeTeam: 'Bologna', awayTeam: 'AC Milan', homeScore: '0', awayScore: '2', competition: 'Premier League' },

  • { matchTime: 'FT', homeTeam: 'Ankaragucu', awayTeam: 'Adana Demirspor', homeScore: '1', awayScore: '1', competition: 'Premier League' },

  • { matchTime: 'FT', homeTeam: 'Samsunspor', awayTeam: 'Fenerbahce', homeScore: '0', awayScore: '2', competition: 'Premier League' },

  • { matchTime: 'FT', homeTeam: 'Famalicao', awayTeam: 'Moreirense', homeScore: '0', awayScore: '0', competition: 'Premier League' },

  • { matchTime: 'FT', homeTeam: 'Crystal Palace', awayTeam: 'Arsenal', homeScore: '0', awayScore: '1', competition: 'LaLiga' },

  • { matchTime: 'FT', homeTeam: 'Deportivo Alaves', awayTeam: 'Sevilla', homeScore: '4', awayScore: '3', competition: 'LaLiga' },

  • { matchTime: "90+4'", homeTeam: 'Granada', awayTeam: 'Rayo Vallecano', homeScore: '0', awayScore: '2', competition: 'LaLiga' },

  • { matchTime: 'FT', homeTeam: 'Torino', awayTeam: 'Cagliari', homeScore: '0', awayScore: '0', competition: 'LaLiga' },

  • { matchTime: 'FT', homeTeam: 'Bologna', awayTeam: 'AC Milan', homeScore: '0', awayScore: '2', competition: 'LaLiga' },

  • { matchTime: 'FT', homeTeam: 'Ankaragucu', awayTeam: 'Adana Demirspor', homeScore: '1', awayScore: '1', competition: 'LaLiga' },

As you can see it just logs duplicates of the matches and assigns the wrong league. Any pointers in the right direction would be appreciated:) Critique in general on my code would also be nice.

Mannen
  • 35
  • 5
  • Why not use the working code from [Closure of Puppeteer Browser After Clicking Cookie Accept Button - Unexpected Behavior](https://stackoverflow.com/questions/76909200/closure-of-puppeteer-browser-after-clicking-cookie-accept-button-unexpected-be/76909667#76909667)? This is very overcomplicated and doesn't take advantage of the containers to organize each result element. I've also advised not using ElementHandles because of exactly this complexity. – ggorlen Aug 21 '23 at 21:46
  • Frankly, because I did (do) not understand what the code you wrote did. I will try your way, read some of the docs and come back. Thank you. – Mannen Aug 21 '23 at 21:55
  • 1
    If you ask, I can try to clarify what you didn't understand. – ggorlen Aug 21 '23 at 21:57

1 Answers1

2

There's already nearly-working code here. The advice in that thread applies equally here--avoid ElementHandles.

Selecting elements in "parallel" and assuming every els[j] will match up is usually an unreliable approach. The typical approach is to respect the nesting structure and either work top-down (first select the outermost container that contains all information you want, then optionally flatten out child containers) or bottom-up (work on a row-wise element, and use .closest() to pop up to parents as necessary).

Here's the bottom-up approach, a small modification of the existing code:

const puppeteer = require("puppeteer"); // ^21.0.2

const url = "<Your URL>";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  await page.waitForSelector(".Ip");
  const content = await page.$$eval(".Ip", els =>
    els.map(e => {
      const text = (id, container = e) =>
        container.querySelector(`[id*=${id}]`).textContent.trim();
      const league = e.closest("[data-known-size]");
      return {
        time: text("status-or-time"),
        home: text("home-team-name"),
        away: text("away-team-name"),
        homeTeamScore: +text("home-team-score"),
        awaitTeamScore: +text("away-team-score"),
        competition: text("category-header__stage", league),
        country: text("category-header__category", league),
      };
    })
  );
  console.log(content);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());

Even easier: they have a public API which returns JSON, so you could grab the data from there without Puppeteer. This avoids having to scroll the container to get missing rows. Here's an example with jq and curl:

curl -s "https://prod-public-api.livescore.com/v1/api/app/date/soccer/20230821/-7?MD=1&countryCode=US" |
jq '[
  .Stages |
  .[] as $parent |
  $parent.Events[] |
  {
    league: $parent.Snm,
    country: $parent.Cnm,
    home: .T1[0].Nm,
    away: .T2[0].Nm,
    homeTeamScore: .Tr1,
    awayTeamScore: .Tr2,
    status: .Eps,
    time: .Esd,
  }
]'

And in vanilla Node JS 18+:

const url = "https://prod-public-api.livescore.com/v1/api/app/date/soccer/20230821/-7?MD=1&countryCode=US";

fetch(url)
  .then(res => {
    if (!res.ok) {
      throw Error(res.statusText);
    }

    return res.json();
  })
  .then(data => {
    const events = data.Stages.flatMap(league =>
      league.Events.map(event => ({
        league: league.Snm,
        country: league.Cnm,
        home: event.T1[0].Nm,
        away: event.T2[0].Nm,
        homeTeamScore: +event.Tr1,
        awayTeamScore: +event.Tr2,
        status: event.Eps,

        // optionally, parse the time
        time: event.Esd.toString()
          .replace(
            /^(\d{4})(\d\d)(\d\d)(\d\d)(\d\d)(\d\d)$/g,
            (...args) =>
              `${args.slice(1, 4).join(".")} ${args.slice(4, 7).join(":")}`
          ),
      }))
    );
    console.log(events);
  })
  .catch(err => console.error(err));
ggorlen
  • 44,755
  • 7
  • 76
  • 106
  • Thank you very much! This worked perfectly. – Mannen Aug 22 '23 at 11:59
  • Hi again, where did you find their official api? Can't seem to find it, thanks. – Mannen Aug 23 '23 at 20:17
  • 1
    Good question. I looked in the network tab. [Here's a guide](https://stackoverflow.com/a/66878732/6243352). It's in Python, but the basic concept applies equally here. – ggorlen Aug 23 '23 at 20:20