I am creating a Twitter scraper as a project. Tweets are rendered in the DOM as you scroll down so I want to use Puppeteer to scroll, extract data and save it into a predefined object, then continue scrolling. The problem is that the script is not actually modifying the object provided and I am left with an empty object.
The for loop to extract data works when called outside the scrolling function (i.e. I can extract the first tweets rendered in the page). The scrolling function itself works, I got it from Puppeteer - scroll down until you can't anymore .
For testing purposes I set the scrolling function to only scroll 20 times (it is otherwise designed to scroll until it can't scroll anymore). Here is my code:
app.get('/scrape', async (req, res) => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.setJavaScriptEnabled(true)
await page.goto(`https://twitter.com/${req.query.url}`);
await page.setJavaScriptEnabled(true)
let obj = {}
await autoScroll(page, obj)
async function autoScroll(page, obj) {
await page.evaluate(async (obj) => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 400;
var count = 0
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
for (let i = 0; i < 100; i++) {
let id, date, text
try {
id = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].getAttribute('data-tweet-id')
date = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[1].getAttribute('title')
text = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[3].childNodes[1].innerHTML
obj[id] = { date: date, text: text }
console.log(i)
} catch (err) { continue }
}
count++
//if(totalHeight >= scrollHeight){
if (count === 20) {
clearInterval(timer);
resolve();
}
}, 400);
});
}, obj);
}
res.send(obj)
await browser.close();
})
The request sends an empty object every time. I don't receive any error messages or console logs; if they are there, I can't see them because they are executed in the context of the headless Chrome browser than Puppeteer generates.
Any help would be appreciated!