5

I am trying to scrape a website which has infinite scrolling.

I am controlling the scroll but still, it exits after reaching at the end of the webpage.

This is my code:

const puppeteer = require("puppeteer");

module.exports.scraper = async (url, callBack) => {
    const browser = await puppeteer.launch({ headless: false });
    const page = await browser.newPage();

    await page.setUserAgent(
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    );

    await page.setViewport({ width: 1200, height: 768 });

    function wait(ms) {
        return new Promise((resolve) => setTimeout(() => resolve(), ms));
    }

    await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
        waitUntil: "networkidle0",
    });

    // Get the height of the rendered page
    const bodyHandle = await page.$("body");
    const { height } = await bodyHandle.boundingBox();
    await bodyHandle.dispose();

    // Scroll one viewport at a time, pausing to let content load
    const viewportHeight = page.viewport().height;
    let viewportIncr = 0;
    while (viewportIncr + viewportHeight < height) {
        await page.evaluate((_viewportHeight) => {
            window.scrollBy(0, _viewportHeight);
        }, viewportHeight);
        await wait(1600);
        viewportIncr = viewportIncr + viewportHeight;
    }

    let data = await page.evaluate(() => {
        window.scrollTo(0, 0);
        let products = [];
        let productElements = document.querySelectorAll(".product-wrap");

        productElements.forEach((productElement) => {
            let productJson = {};
            try {
                productJson.imageUrl = productElement.querySelector(".renderedImg").src;
                productJson.brandName = productElement.querySelector(
                    ".brand-name",
                ).innerText;
            } catch (e) {
                console.log(e);
            }
            products.push(productJson);
        });
        return products;
    });
    await wait(100);
    callBack(data, true);
    await browser.close();
};

How to scrape in such situation?

ggorlen
  • 44,755
  • 7
  • 76
  • 106
Sagar Chavan
  • 249
  • 5
  • 14
  • 1
    What do you mean by "it exits after reaching at the end of the webpage"? What should it do instead? It's not clear to me yet what problem you're trying to solve. – Todd Price Oct 06 '20 at 19:23
  • i need my script to wait for it to load content after scroll reaches at the bottom of the page. I am scraping the website which has an infinite scrolling function. – Sagar Chavan Oct 06 '20 at 19:26

1 Answers1

5

Here's one strategy to handle infinite scrolling. It repeats a scroll/compare in a loop until scrolling has no effect. i.e. when we tell it to scroll, but we're still at the same scrollTop value we were last iteration, consider it done. In extreme cases the browser will eventually run out of heap memory and crash, but this is our starting point for the average site:

const puppeteer = require('puppeteer');
const url = 'https://example.com';

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  page.on('console', async msg => {
    const args = msg.args();
    const vals = [];
    for (let i = 0; i < args.length; i++) {
      vals.push(await args[i].jsonValue());
    }
    console.log(vals.join('\t'));
  });
  await page.goto(url);
  await page.evaluate(()=> {
    
    const wait = (duration) => { 
      console.log('waiting', duration);
      return new Promise(resolve => setTimeout(resolve, duration)); 
    };

    (async () => {
      
      window.atBottom = false;
      const scroller = document.documentElement;  // usually what you want to scroll, but not always
      let lastPosition = -1;
      while(!window.atBottom) {
        scroller.scrollTop += 1000;
        // scrolling down all at once has pitfalls on some sites: scroller.scrollTop = scroller.scrollHeight;
        await wait(300);
        const currentPosition = scroller.scrollTop;
        if (currentPosition > lastPosition) {
          console.log('currentPosition', currentPosition);
          lastPosition = currentPosition;
        }
        else {
          window.atBottom = true;
        }
      }
      console.log('Done!');

    })();

  });

  await page.waitForFunction('window.atBottom == true', {
    timeout: 900000,
    polling: 1000 // poll for finish every second
  });

  await page.screenshot({path: 'example.png', fullPage: true});

  await browser.close();
})();
Todd Price
  • 2,650
  • 1
  • 18
  • 26