1

I am creating a Twitter scraper as a project. Tweets are rendered in the DOM as you scroll down so I want to use Puppeteer to scroll, extract data and save it into a predefined object, then continue scrolling. The problem is that the script is not actually modifying the object provided and I am left with an empty object.

The for loop that extracts the data works when called outside the scrolling function (i.e. I can extract the first tweets rendered on the page). The scrolling function itself works; I got it from the question "Puppeteer - scroll down until you can't anymore".

For testing purposes I set the scrolling function to only scroll 20 times (it is otherwise designed to scroll until it can't scroll anymore). Here is my code:

// Express route handler: launches a Puppeteer browser, opens the requested
// Twitter page, scrolls it on a timer while trying to collect tweet data,
// then sends `obj` as the response.
app.get('/scrape', async (req, res) => {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.setJavaScriptEnabled(true)
    await page.goto(`https://twitter.com/${req.query.url}`);
    await page.setJavaScriptEnabled(true)
    // Result container that lives on the Node.js side of the script.
    let obj = {}
    await autoScroll(page, obj)
    // Scrolls the page in fixed steps and, on every tick, tries to record
    // tweets into `obj`.
    // NOTE: arguments given to page.evaluate are JSON-serialized and
    // transferred to the page context, so the `obj` written to inside the
    // callback is a page-side copy. The outer `obj` is never modified, which
    // is why the response below is always an empty object.
    async function autoScroll(page, obj) {
        await page.evaluate(async (obj) => {
            await new Promise((resolve, reject) => {
                var totalHeight = 0;
                // Scroll distance applied on each timer tick.
                var distance = 400;
                // Number of ticks executed so far.
                var count = 0
                var timer = setInterval(() => {
                    var scrollHeight = document.body.scrollHeight;
                    window.scrollBy(0, distance);
                    totalHeight += distance;
                    // Probe child positions 0..99 of a fixed, hard-coded DOM
                    // path (presumably the timeline container -- verify
                    // against the live page markup).
                    for (let i = 0; i < 100; i++) {
                        let id, date, text
                        try {
                            id = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].getAttribute('data-tweet-id')
                            date = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[1].getAttribute('title')
                            text = document.body.childNodes[7].childNodes[3].childNodes[1].childNodes[5].childNodes[1].childNodes[1].childNodes[3].childNodes[1].childNodes[3].childNodes[7].childNodes[1].childNodes[3].childNodes[1].childNodes[i].childNodes[1].childNodes[3].childNodes[3].childNodes[1].innerHTML
                            // Writes to the page-context copy of `obj` only.
                            obj[id] = { date: date, text: text }
                            console.log(i)
                        // Any error along the childNodes chain is swallowed
                        // and the position is skipped.
                        } catch (err) { continue }
                    }
                    count++
                    // Testing cap: stop after 20 ticks. The commented-out
                    // condition is the scroll-until-the-bottom check.
                    //if(totalHeight >= scrollHeight){
                    if (count === 20) {
                        clearInterval(timer);
                        // Resolves without a value, so nothing collected in
                        // the page is handed back to the caller.
                        resolve();
                    }
                }, 400);
            });
        }, obj);
    }
    // Sends the Node-side `obj`, which autoScroll did not populate.
    res.send(obj)
    await browser.close();
})

The request sends an empty object every time. I don't receive any error messages or console logs; if there are any, I can't see them because they are emitted in the context of the headless Chrome browser that Puppeteer launches.

Any help would be appreciated!

Nick
  • 396
  • 1
  • 3
  • 19

1 Answer

1

The arguments you pass to page.evaluate will be JSON-serialized and transferred to the page context.

The properties you assign to obj in your page.evaluate() function will only be present in the page context, not in the script where you called page.evaluate.

You can work around this by returning the obj object from the function instead of passing it as parameter:

// Build the result inside the page and return it: the value the promise
// resolves with is what page.evaluate hands back to the calling script.
let obj = await page.evaluate(async() => {
  return new Promise(resolve => {
      // Page-context object; distinct from the outer `obj` being assigned.
      let obj = {};
      // ...
      // set something on obj
      obj['foo'] = 'bar';

      // resolve with the obj so it is transferred back to the caller
      resolve(obj);
      // ...
  });
});

Integrated in your code snippet:

/**
 * GET /scrape?url=<path>
 *
 * Opens https://twitter.com/<path> with Puppeteer, scrolls the page on a timer
 * while harvesting tweets, and responds with an object keyed by tweet id:
 * `{ [id]: { date, text } }`.
 *
 * The browser is always closed, including when navigation or scraping throws.
 */
app.get('/scrape', async (req, res) => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        // One call before navigation is enough; the setting only takes
        // effect for subsequent navigations.
        await page.setJavaScriptEnabled(true);
        await page.goto(`https://twitter.com/${req.query.url}`);
        const obj = await autoScroll(page);
        res.send(obj);
    } finally {
        // Release the Chrome process even if anything above failed.
        await browser.close();
    }

    /**
     * Scrolls the page in fixed steps and collects tweets after every step.
     *
     * Everything inside the evaluate callback runs in the page context, so it
     * must be self-contained; the collected object is returned (not passed in)
     * because evaluate arguments are serialized copies.
     *
     * @param {object} page - Puppeteer page that has already been navigated.
     * @returns {Promise<object>} Map of tweet id to `{ date, text }`.
     */
    async function autoScroll(page) {
        return page.evaluate(() => {
            const SCROLL_STEP_PX = 400;
            const TICK_MS = 400;
            // Testing cap; a full run would instead stop once the scrolled
            // distance reaches document.body.scrollHeight.
            const MAX_TICKS = 20;
            const MAX_ROWS = 100;
            // childNodes indices leading from <body> to the tweet list.
            const LIST_PATH = [7, 3, 1, 5, 1, 1, 3, 1, 3, 7, 1, 3, 1];
            const obj = {};

            // Follows a chain of childNodes indices; throws a TypeError as
            // soon as a node along the chain is missing.
            const descend = (node, path) =>
                path.reduce((current, index) => current.childNodes[index], node);

            return new Promise((resolve) => {
                let count = 0;
                const timer = setInterval(() => {
                    window.scrollBy(0, SCROLL_STEP_PX);
                    for (let i = 0; i < MAX_ROWS; i++) {
                        try {
                            const tweet = descend(document.body, [...LIST_PATH, i, 1]);
                            const id = tweet.getAttribute('data-tweet-id');
                            const content = tweet.childNodes[3];
                            const date = descend(content, [1, 3, 1]).getAttribute('title');
                            const text = descend(content, [3, 1]).innerHTML;
                            obj[id] = { date, text };
                            console.log(i);
                        } catch (err) {
                            // Deliberate best-effort: positions whose DOM does
                            // not have the expected shape are skipped.
                            continue;
                        }
                    }
                    count++;
                    if (count === MAX_TICKS) {
                        clearInterval(timer);
                        resolve(obj);
                    }
                }, TICK_MS);
            });
        });
    }
});

If you're using a transpiler like babel you might need to pass the function as a string to page.evaluate, e.g.:

// A string argument is evaluated as an expression in the page context, so the
// async function has to be invoked (IIFE) for its promise to run and be awaited.
await page.evaluate(`(async () => {
  return Promise.resolve(42);
})()`);

(puppeteer will call .toString() on your function to get the source, which might contain references to helpers used by babel, which aren't present in the page context)

Edit:
To debug your selectors you can try to launch puppeteer in non-headless mode. That way you get a real browser window where you can access the dev console. e.g.:

// Non-headless mode opens a real browser window whose dev console can be used for debugging.
const browser = await puppeteer.launch({headless: false});
Turtlefight
  • 9,420
  • 2
  • 23
  • 40
  • Thank you, I am not using babel to transpile this file. I implemented your code and it returned undefined. I added another return statement as "return await new Promise((resolve, reject) => {" and got an empty object as my result. – Nick Aug 16 '19 at 14:31
  • After further testing I tried adding obj.a = 1 to my for loop and it did return that, so I think your code is working and it is my try/catch that is failing to return any data. Should I be using id = await page.evaluate(i => document.body.childNode[7]... , i) inside another evaluate function? I am not sure where the problem in my logic is here. – Nick Aug 16 '19 at 14:39
  • @NickP you're correct i forgot to add the return, fixed it. You could try removing the try / catch and see what errors you are getting. Probably one of your `.childNodes[]` returns null because the element is not there. You can also use [`document.querySelector`](https://developer.mozilla.org/de/docs/Web/API/Document/querySelector) or `document.querySelectorAll` if you can target the node you want with css selectors – Turtlefight Aug 16 '19 at 14:54
  • I tried making my setInterval async, passing page into my evaluate function properly, and using the "id = await page.evaluate(i => document.body.childNode[7]... , i)" format instead. I got a circular JSON error. As to your comment, unfortunately there are no selectors to target. The nested childNodes were working when I didn't have them wrapped within the scroll function, so I'm not sure. I will keep playing with it. Sadly I cannot see the specific errors within my evaluate function since they show up in the invisible headless Chrome instance which makes debugging especially tough. – Nick Aug 16 '19 at 14:55
  • 1
    @NickP you can't pass the `page` into evaluate. All arguments you pass to `evaluate` must be either JSON serializable (plain objects or primitives) or JSHandles (a handle referring to an object in the page) – Turtlefight Aug 16 '19 at 14:57
  • @NickP you could try launching puppeteer in non-headless mode, that way you can see the logs within the evaluate function in the dev console of the window & pause on exceptions to see where your code is failing – Turtlefight Aug 16 '19 at 15:06