1

I've tried this code


const cheerio = require("cheerio");
const axios = require('axios');

/**
 * Downloads the Carrefour UAE search page for "tomato kathcup", loads the
 * returned HTML into cheerio and logs an array holding one
 * `{ title, price }` record per `ul[data-testid]` element found in it.
 *
 * Nothing is returned; any error raised along the way is caught and logged.
 */
async function getProducts() {
  const searchUrl = 'https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup';

  try {
    const response = await axios.get(searchUrl);
    const markup = await response.data;
    const $ = cheerio.load(markup);

    // Combined, trimmed text of every descendant of `node` matching `selector`.
    const textOf = (node, selector) => $(node).find(selector).text().trim();

    const products = $('ul[data-testid]')
      .toArray()
      .map((listEl) => ({
        title: textOf(listEl, 'a[data-testid="product_name"]'),
        price: textOf(listEl, 'div[data-testid="product_price"] .css-fzp91j'),
      }));

    console.log(products);
  } catch (failure) {
    console.log(failure);
  }
}

getProducts();

I need a product list array containing the title and price of each product, but this code returns an empty array. What should I do to get these details? Example link: https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup.

Scraping Amazon works, but web scraping does not work on this Carrefour website!

const cheerio = require("cheerio");
const axios = require('axios');

/**
 * Requests the Carrefour UAE search results for "tomato kathcup", parses the
 * response body with cheerio, and prints the collected `{ title, price }`
 * pairs (one per `ul[data-testid]` element) to the console.
 *
 * The function resolves with no value; errors are caught and printed.
 */
async function getProducts() {
  try {
    const { data } = await axios.get(
      'https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup'
    );
    const html = await data;
    const $ = cheerio.load(html);

    const products = [];
    for (const element of $('ul[data-testid]').toArray()) {
      const entry = $(element);
      const title = entry.find('a[data-testid="product_name"]').text().trim();
      const price = entry
        .find('div[data-testid="product_price"] .css-fzp91j')
        .text()
        .trim();
      products.push({ title, price });
    }

    console.log(products);
  } catch (err) {
    console.log(err);
  }
}

getProducts();

I tried this, expecting to get the details and prices of the products using cheerio in Node.js.

David Makogon
  • 69,407
  • 21
  • 141
  • 189
  • The page is JS-driven, so `product_name` doesn't appear in the HTML returned by axios. Try Puppeteer or Playwright. – ggorlen Feb 07 '23 at 16:32
  • Does this answer your question? [How can I scrape pages with dynamic content using node.js?](https://stackoverflow.com/questions/28739098/how-can-i-scrape-pages-with-dynamic-content-using-node-js) – ggorlen Feb 07 '23 at 16:32
  • @ggorlen I wasn't able to find the product_name using puppeteer. Could you please write scraping code for https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup – Mahinur Rahman Feb 08 '23 at 15:54
  • If you post your attempt as a new question, I'll see it and try to answer it. You could also [edit] this one since nobody has answered it. – ggorlen Feb 08 '23 at 17:17

1 Answers1

0

Replying to an old question, but if you don't want to use puppeteer, you can still get the data from this site (and others like it) if they embed their data on every page in JSON format within script tags. This site has it in <script id="__NEXT_DATA__">...</script>.

For products spread over multiple pages, look at the URL. In this case, a Load More button appears when there are too many products to display, and clicking on it changes the URL from

https://www.carrefouruae.com/mafuae/en/v4/search?keyword=still%20water

to

https://www.carrefouruae.com/mafuae/en/v4/search?currentPage=1&filter=&keyword=still%20water&pageSize=60&sortBy=relevance

The rest is to fetch each page and add its products to an array until no product data is returned.

const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
let fsp = fs.promises;

(async () => {

    /**
     * Fetches a single page of Carrefour UAE search results and extracts the
     * product list embedded as JSON in the page's `<script id="__NEXT_DATA__">`.
     *
     * @param {string} keyword - Search term; it is URL-encoded here.
     * @param {number} page - Value sent as the `currentPage` query parameter.
     * @returns {Promise<object[]>} Simplified product records; an empty array
     *     when the embedded JSON carries no product array.
     * @throws {Error} When the request fails, the script tag is missing/empty,
     *     or its content is not valid JSON.
     */
    async function fetchPage(keyword, page) {
        // encodeURIComponent rather than encodeURI: the keyword is a query-string
        // VALUE, so characters such as `&`, `=`, `#` and `+` must be escaped too.
        const url = `https://www.carrefouruae.com/mafuae/en/v4/search?currentPage=${page}&filter=&keyword=${encodeURIComponent(keyword)}&pageSize=60&sortBy=relevance`;

        const response = await axios.get(url);
        const $ = cheerio.load(response.data);

        const raw = $('script#__NEXT_DATA__').text();
        if (raw.trim() === '') {
            // Fail with a readable message instead of JSON.parse's "Unexpected end of JSON input".
            throw new Error(`No __NEXT_DATA__ payload found on page ${page} for keyword "${keyword}"`);
        }

        const parsed = JSON.parse(raw);
        const products = parsed.props.initialState.search.products; // take a look here to see other fields that you can get, add them to the return below.
        if (!Array.isArray(products)) {
            return [];
        }

        return products.map((x) => ({
            name: x.name,
            brand: x.brand,
            size: x.size,
            price: x.originalPrice,
            currency: x.currency,
            discount: x.discount,
            url: x.url,
            // Tolerate products without image data instead of aborting the whole run.
            image: x.image ? x.image.href : undefined,
            otherImages: Array.isArray(x.imageLibrary) ? x.imageLibrary.map((el) => el.href) : [],
            min: x.min,
            max: x.max,
            origin: x.productOrigin,
            supplier: x.supplier,
        }));
    }

    /**
     * Collects the products for `keyword` from page `page` onwards, requesting
     * consecutive pages until one comes back without products, then writes the
     * collected list to `searchResults.json` (once) and returns it.
     *
     * If a page fails, the error is logged and the products gathered so far are
     * kept, so the function always resolves with an array.
     *
     * @param {string} keyword - Search term.
     * @param {number} [page=0] - First page to request.
     * @returns {Promise<object[]>} Flat array of product records.
     */
    async function getProducts(keyword, page = 0) {
        const all = [];

        try {
            // Iterative pagination: no call-stack growth and no nested arrays to flatten.
            for (let current = page; ; current += 1) {
                const products = await fetchPage(keyword, current);
                if (products.length === 0) { // page returned no products: we are done
                    break;
                }
                all.push(...products);
            }
        } catch (error) {
            console.error(error);
        }

        // write everything to JSON file, a single time after pagination has finished
        try {
            const data = JSON.stringify(all, null, 2);
            await fsp.writeFile('searchResults.json', data);
        } catch (error) {
            console.error(error);
        }

        return all;
    }

    const result = await getProducts('still water');

    console.log(result);
    console.log(result.length); // check last product manually as the site displays a wrong product count.

})().catch(err => console.error(err));
idchi
  • 761
  • 1
  • 5
  • 15