This script scrolls a page with infinite scrolling and captures all the links.
It moves toward the bottom repeatedly, causing new content to load on each pass.
- How can I return the collected results from page.evaluate?
- Moreover, how can I return results in chunks, so partial results are streamed back as they arrive instead of all being appended to one array?
The script:
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    userDataDir: "C:\\Users\\johndoe\\AppData\\Local\\Google\\Chrome\\User Data\\Default"
  });
  const page = await browser.newPage();
  await page.setViewport({
    width: 1920,
    height: 1080,
    deviceScaleFactor: 1,
  });

  // Subscribe to the page's console event BEFORE navigating, so logs emitted
  // from inside evaluate() (and during page load) reach the Node process.
  page.on('console', (msg) => console.log('PAGE LOG:', msg.text()));

  await page.goto('https://www.facebook.com/groups/000000000000/members', { waitUntil: 'networkidle0' });

  // BUG FIX (returning results): the original `return results` sat inside the
  // setTimeout callback, so it returned from the timer tick — not from the
  // pageFunction — and `rawMembers` was always undefined. page.evaluate awaits
  // a Promise returned by the pageFunction, so we wrap the scroll loop in a
  // Promise and resolve it with the accumulated links when scrolling stops.
  // (To receive results in CHUNKS instead, register a Node-side callback with
  // page.exposeFunction before evaluate, and call it from each tick.)
  const rawMembers = await page.evaluate(() => new Promise((resolve) => {
    const intervall = 3000;
    let stop = false;
    document.addEventListener('keypress', () => { stop = true; }); // press a key to exit
    const results = [];
    let pageHeigth = 0;

    setTimeout(function tick() {
      if (!stop && document.body.scrollHeight > pageHeigth) {
        pageHeigth = document.body.scrollHeight; // remember the current page height
        // Jump to the bottom of the page; the virtual scroller reacts by
        // loading the next batch of content.
        document.scrollingElement.scrollTop = pageHeigth;
        console.log('PAGE HEIGTH: ', pageHeigth);
        // Capture all link hrefs as plain strings (serializable across the
        // evaluate boundary — DOM nodes are not).
        const anchors = Array.from(document.querySelectorAll('a'));
        // BUG FIX (empty array): Array.prototype.concat returns a NEW array
        // and leaves `results` untouched; push(...) appends in place.
        results.push(...anchors.map((a) => a.getAttribute('href')));
        setTimeout(tick, intervall); // schedule the next pass
      } else {
        // Height stopped growing (no more content) or a key was pressed.
        console.log('Exit');
        resolve(results); // hands the array back to page.evaluate in Node
      }
    }, intervall);
  }));

  console.log('Captured links:', rawMembers.length);
  //await browser.close();
})();