4

So I am trying to pull out information using data scraping from this real estate website (https://www.zillow.com/vancouver-bc/) I am able to get all the information about the listing on the page but with images (image links/src), after a few of them, the result is some garbage. I tried researching and found it was because of lazy loading. For which is tried almost all the methods available and answered by others but none seem to work - this includes scrolling to the bottom, scrolling with delays (https://www.npmjs.com/package/puppeteer-autoscroll-down), zooming out the browser as much as I can to get the images to render. But it still doesn't work. I have been looking everywhere for hours now before I decided to post my question and code here itself for anyone else to figure it out.

let cheerio        = require('cheerio')
let puppeteer      = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent      = require('random-useragent')
const baseURL      = "https://www.zillow.com/vancouver-bc"
let estateData     = []
let urlLinks       = []

let scrollPageToBottom = require('puppeteer-autoscroll-down')


let getEstateData = async () => {
    estateData = []
    urlLinks   = []
    let url
    for (let pgNum = 1; pgNum <= 1; pgNum++) {
        if (pgNum === 1) {
            url = baseURL + "/"
        } else {
            url = baseURL + ("/" + pgNum + "_p")
        }
        urlLinks.push(url)
    }
    await searchWebsite()
    console.log("search over")
    return estateData
    //module.exports = estateData
}

let searchWebsite = async () => {
    await puppeteer
        .launch({headless : false})
        .then(async function (browser) {
            let page = await browser.newPage();
            // await page.setRequestInterception(true)
            //
            // page.on('request', (req) => {
            //     if( req.resourceType() === 'image' || req.resourceType() === 'stylesheet' || req.resourceType() === 'font'){
            //         req.abort()
            //     }
            //     else {
            //         req.continue()
            //     }
            //
            // })

            let html
            await page.setUserAgent(userAgent.getRandom())
            for(let url of urlLinks){
                console.log(url)
                await page.goto(url).then(async function () {
                    html = await page.content();
                    let obj = await cheerio('.list-card-link.list-card-info', html)
                    let imgObj = await cheerio(".list-card-top", html)
                    let geoLocation = await cheerio(".photo-cards.photo-cards_wow", html)


                    // await page.waitForSelector('img',{
                    //     visible: true,
                    // })
                    // await page.evaluate(() => { window.scrollTo(0, document.body.scrollHeight)})
                    const scrollStep = 250 // default
                    const scrollDelay = 100 // default
                    const lastPosition = await scrollPageToBottom(page, scrollStep, scrollDelay)
                    await page.waitFor(2000)

                    let num = 0
                    console.log(obj.length)
                    for (let key in obj) {
                        if (obj[key].attribs) {
                            try {
                                let geoStr = await geoLocation[0].children[0].children[0].children[0].data
                                let geoObj = await (JSON.parse(geoStr)["geo"])

                                let extractedInfo = {
                                    estateName : await obj[key].children[0].children[0].data,
                                    estatePrice : await obj[key].children[2].children[0].children[0].data,
                                    saleType : await obj[key].children[1].children[0].next.data,
                                    estateConfig : {
                                        beds :  await obj[key].children[2].children[1].children[0].children[0].data,
                                        bath :  await obj[key].children[2].children[1].children[1].children[0].data,
                                        area :  await obj[key].children[2].children[1].children[2].children[0].data
                                    },
                                    estateLocation : {
                                        longitude : await geoObj.longitude,
                                        latitude : await geoObj.latitude
                                    },
                                    estateLink : await obj[key].attribs.href,
                                    estateCoverImgLink : await imgObj[num++].children[2].children[0].attribs.src
                                }
                                console.log(extractedInfo.estateName, imgObj[num].children[2].children[0].attribs.src)
                                await estateData.push(extractedInfo)
                            }
                            catch (e) {
                                console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
                                console.log(e)
                            }
                        }
                    }
                    console.log(estateData.length)
                });
            }
            //Now read the page

            console.log("total - ", estateData.length)
            await page.close()
            await browser.close()
        })
        .catch(function (err) {
            console.log(err)
        });
}

module.exports.getEstateData = getEstateData
Aryan Arora
  • 143
  • 2
  • 12
  • Hey! I put together this gist for you. This is just old code that I pulled from an old project. This is what I have used in the past to scroll the page in puppeteer. It might not 100% work, but it should give you some direction: https://gist.github.com/maxrbaldwin/c6e10f3184af0660081abd5732ebb3ba – Max Baldwin Jul 15 '19 at 17:22
  • 1
    @MaxBaldwin thanks for the quick response. I tried to scroll the whole page before using my loops to get the data but the same thing is happening. Even though the page images seems to have been loaded completely when scraping data (image )src comes out to be some garbage ("'") – Aryan Arora Jul 15 '19 at 17:48
  • @MaxBaldwin is it possible to scroll to a specific element that I can provide with cheerio ? – Aryan Arora Jul 15 '19 at 17:51
  • Does this answer your question? [Puppeteer - scroll down until you can't anymore](https://stackoverflow.com/questions/51529332/puppeteer-scroll-down-until-you-cant-anymore) – ggorlen Nov 22 '20 at 16:02

2 Answers2

3

I had a similar issue and found a working answer here. Hopefully this works for you too. The interval was a little slow so I changed it from 100 to 30.

Ludolfyn
  • 1,806
  • 14
  • 20
  • 1
    Please summarize the contents of the link in the question itself (without plagiarizing). Since it's just another SO post, the typical approach is to leave a comment or vote to close as a duplicate if you have sufficient rep. See [your answer is in another castle](https://meta.stackexchange.com/questions/225370/your-answer-is-in-another-castle-when-is-an-answer-not-an-answer). Thanks. – ggorlen Nov 22 '20 at 16:01
  • 2
    A link to a solution is welcome, but please ensure your answer is useful without it: [add context around the link](//meta.stackexchange.com/a/8259) so your fellow users will have some idea what it is and why it’s there, then quote the most relevant part of the page you're linking to in case the target page is unavailable. [Answers that are little more than a link may be deleted.](/help/deleted-answers) – Sabito stands with Ukraine Nov 22 '20 at 16:02
1

I was able to solve this with a pretty simple implementation using the puppeteer-autoscroll-down library as you mentioned. I'm not sure which images you were specifically attempting to grab, but this worked for me.

// Set the initial viewport and navigate to the page
await page.setViewport({ width: 1300, height: 1000 });
await page.goto('https://www.zillow.com/vancouver-bc/', { waitUntil: 'load' });

// Scroll to the very top of the page
await page.evaluate(_ => {
      window.scrollTo(0, 0);
});

// Scroll to the bottom of the page with puppeteer-autoscroll-down
await scrollPageToBottom(page);

// Get your image links
let imageLinks = await page.$$eval('.list-card img', imgLinks => {
    return imgLinks.map((i) => i.src);
});

imageLinks was an array with 40 fully formed links, https://photos.zillowstatic.com/p_e/ISz7wlfm278p501000000000.jpg is one example.

Hope that helps you, this was a pretty brutal one for me to solve as well.

pmcnamee
  • 317
  • 2
  • 8