I have a scraping program that goes to YouTube and fetches the img src of every thumbnail. Navigating to the URL and recording the src values works, but YouTube has an infinite-scrolling feature that ensures the client browser only fetches items once the user has scrolled them into view. To deal with the infinite scrolling, I added a function that scrolls to the part of the page that triggers more content at the bottom, and I use a selector count, i.e. document.querySelectorAll('.class_name').length,
to check whether more content has been generated, but the program still doesn't handle the infinite scrolling. I can't figure out what I missed. Any help is appreciated. Thanks in advance.
/**
 * Opens `url` in a visible Chromium window, keeps scrolling until the number
 * of matched items stops growing, then logs the `src` of every thumbnail
 * image (one per line).
 *
 * The browser is always closed, even when navigation or scraping fails.
 *
 * @param {string} url - Page to scrape.
 * @returns {Promise<void>} Resolves once the result has been logged and the
 *   browser has been closed.
 */
async function scrape(url) {
  // How long to wait after each scroll for lazily-loaded content to appear.
  const delay = 3000;
  const srcSelector = 'ytd-thumbnail > a > yt-img-shadow > #img';
  // Plain timer-based pause; avoids depending on Puppeteer's deprecated
  // page.waitFor / page.waitForTimeout helpers.
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({
    headless: false,
  });
  try {
    const page = await browser.newPage();
    // Set the viewport before navigating so the page is laid out at its
    // final size from the start instead of being resized after load.
    await page.setViewport({
      width: 1200,
      height: 800,
    });
    await page.goto(url, {
      timeout: 0, // 0 disables the navigation timeout
    });

    // Scroll until a scroll + wait cycle no longer produces new items.
    let preCount = 0;
    let postCount = 0;
    do {
      preCount = await getCount(page);
      await scrollDown(page);
      await sleep(delay);
      postCount = await getCount(page);
    } while (postCount > preCount);
    // Give the last batch of thumbnails time to receive their src.
    await sleep(delay);

    const ytSrcData = await page.$$eval(srcSelector, (elems) =>
      elems.map((el) => el.src).join('\n')
    );
    // The original also logged `ytTextData`, which was never defined and
    // raised a ReferenceError; only the collected data is logged here.
    console.log({
      ytSrcData,
    });
  } finally {
    await browser.close();
  }
}
/**
 * Counts the elements on the page that carry the `ytd-rich-item-renderer`
 * class.
 *
 * @param {object} page - Object exposing a Puppeteer-style `$$eval` method.
 * @returns {Promise<number>} Number of matching elements.
 */
async function getCount(page) {
  const itemSelector = '.ytd-rich-item-renderer';
  const countMatches = (nodes) => nodes.length;
  return page.$$eval(itemSelector, countMatches);
}
/**
 * Smooth-scrolls the last `.ytd-rich-item-renderer` element (in document
 * order) into view so the page can load more content below it.
 *
 * The previous `$eval('.ytd-rich-item-renderer:last-child')` used
 * querySelector semantics: it picked the FIRST element in document order
 * that happens to be the last child of its own parent, which is not
 * necessarily the bottom-most item, and it threw when nothing matched.
 * Taking the final entry of the full match list targets the bottom-most
 * match and makes an empty page a no-op.
 *
 * @param {object} page - Object exposing a Puppeteer-style `$$eval` method.
 * @returns {Promise<void>} Resolves after the scroll has been requested.
 */
async function scrollDown(page) {
  await page.$$eval('.ytd-rich-item-renderer', (items) => {
    if (items.length === 0) {
      return;
    }
    items[items.length - 1].scrollIntoView({
      behavior: 'smooth',
      block: 'end',
      inline: 'end',
    });
  });
}