1

So I'm building a program that scrapes Poshmark webpages and extracts the usernames of each seller on the page!

I want it to go through every page using the 'next' button, but theres 6 buttons all with the same class name...

Heres the link: https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false

(In my google chrome this page has an infinite scroll (hence the scrollToBottom async function i started writing) but i realized inside puppeteer's chrome it has 'next page' buttons.)

The window displays page 1-5 and then the 'next page' button.

The problem is that all of the buttons share the same html class name, so I'm confused on how to differentiate.

const e = require('express');
const puppeteer = require('puppeteer');
const url = "https://poshmark.com/category/Men-Jackets_&_Coats?sort_by=like_count&all_size=true&my_size=false";
let usernames = [];

 const initItemArea = async (page) => {

    const itemArea = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
    });
 }

 const pushToArray =  async (itemArea, page) => {

    itemArea.forEach(function (element) {
        //console.log('username: ', $(element).text());
        usernames.push(element);
    });

 };

 const scrollToBottom = async (itemArea, page) => {

    while (true) {

        previousHeight = await page.evaluate('document.body.scrollHeight');
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        await page.waitForFunction(`document.body.scrollHeight > ${previousHeight}`);
    
        await new Promise((resolve) => setTimeout(resolve, 1000));

        await page.screenshot({path : "ss.png"})
    }
};


const gotoNextPage = async (page) => {
    await page.waitForSelector(".button.btn.btn--pagination");

    const nextButton = await page.evaluate((page) => {
        document.querySelector(".button.btn.btn--pagination")
    });
    
    await page.click(nextButton);
    console.log('Next Page Loading')

};


async function main() {
 
    const client = await puppeteer.launch({
        headless: false,
        executablePath: "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
    });

    const page = await client.newPage();
    await page.goto(url);
    await page.waitForSelector(".tc--g.m--l--1.ellipses");

    const itemArea = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('.tc--g.m--l--1.ellipses')).map(x => x.textContent);
    });


    gotoNextPage(page)
    
};

main();

Currently, my gotoNextPage function doesnt even find the button, so i thought i'd entered the selector wrong...

Then when I went to find the selector, I realized all buttons have the same one anyway...

My html knowledge is basically nonexistent, but I want to finish this project out. All help is very appreciated.

Bonus: my initPageArea function doesn't work when I call as a function like that, so I hardcoded it into main()...

I'll be diving deep into this problem later on, as I've seen it before, but any quick answers / direction would be awesome.

Thanks a lot.

2 Answers2

0

Whenever you're messing with buttons and scroll, it's a good idea to think about where the data is coming from. It's usually being delivered to the front-end via a JSON API, so you might as well try to hit that API directly rather than mess with the DOM.

const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;

(async () => {
  const usernames = [];

  for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
    const response = await fetch(url(maxId)); // Node 18 or install node-fetch

    if (!response.ok) {
      throw Error(response.statusText);
    }

    const payload = await response.json();

    if (payload.error) {
      break;
    }

    usernames.push(...payload.data.map(e => e.creator_username));
  }

  console.log(usernames.slice(0, 10));
  console.log("usernames.length", usernames.length);
})()
  .catch(err => console.error(err));

The response blob has a ton of additional data.

I would add a significant delay between requests if I were to use code like this to avoid rate limiting/blocking.


If you're set on Puppeteer, something like this should work as well, although it's slower and I didn't have time to run to the end of the 5k (or more?) users:

const puppeteer = require("puppeteer"); // ^19.1.0

const url = "Your URL";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});

  const usernames = [];
  const sel = ".tc--g.m--l--1.ellipses";

  for (;;) {
    try {
      await page.waitForSelector(sel);
      const users = await page.$$eval(sel, els => {
        const text = els.map(e => e.textContent);
        els.forEach(el => el.remove());
        return text;
      });
      console.log(users); // optional for debugging
      usernames.push(...users);
      await page.$$eval(
        ".btn--pagination",
        els => els.find(el => el.textContent.includes("Next")).click()
      );
    }
    catch (err) {
      break;
    }
  }

  console.log(usernames);
  console.log(usernames.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());

I don't think navigations are triggered by the "Next" button, so my strategy for detecting when a page transition has occurred involves destroying the current set of elements after scraping the usernames, then waiting until the next batch shows up. This may seem inelegant, but it's easy to implement and seems reliable, not making assumptions about the usernames themselves.

It's also possible to use Puppeteer and make or intercept API requests, armed with a fresh cookie. This is sort of halfway between the two extremes shown above. For example:

const puppeteer = require("puppeteer");

const url = "Your URL";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const usernames = await page.evaluate(async () => {
    const url = maxId => `https://poshmark.com/vm-rest/channel_groups/category/channels/category/collections/post?request={%22filters%22:{%22department%22:%22Men%22,%22category_v2%22:%22Jackets_%26_Coats%22,%22inventory_status%22:[%22available%22]},%22sort_by%22:%22like_count%22,%22facets%22:[%22color%22,%22brand%22,%22size%22],%22experience%22:%22all%22,%22sizeSystem%22:%22us%22,%22max_id%22:%22${maxId}%22,%22count%22:%2248%22}&summarize=true&pm_version=226.1.0`;
    const usernames = [];

    try {
      for (let maxId = 1; maxId < 5 /* for testing */; maxId++) {
        const response = await fetch(url(maxId)); // node 18 or install node-fetch
      
        if (!response.ok) {
          throw Error(response.statusText);
          break;
        }
 
        const json = await response.json();
      
        if (json.error) {
          break;
        }
      
        usernames.push(...json.data.map(e => e.creator_username));
      }
    }
    catch (err) {
      console.error(err);
    }

    return usernames;
  });

  console.log(usernames);
  console.log("usernames.length", usernames.length);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());

The above code limits to 4 requests to keep it simple and easy to validate.

Blocking images and other unnecessary resources can help speed the Puppeteer versions up, left as an exercise (or just use the direct fetch version shown at top).

ggorlen
  • 44,755
  • 7
  • 76
  • 106
0

you can try selecting the buttons using their position in the page.

For example, you can select the first button using the following CSS selector:

.button.btn.btn--pagination:nth-child(1)

to select the second button:

.button.btn.btn--pagination:nth-child(2)

Got the idea? :)

you can refactor your gotoNextPage function to use this approach, consider this example:

const gotoNextPage = async (page, buttonIndex) => {      
  await page.waitForSelector(".button.btn.btn--pagination");

  // Select the button using its position in the page
  const nextButton = await page.evaluate((buttonIndex) => {
    return document.querySelector(`.button.btn.btn--pagination:nth-child(${buttonIndex})`);
  }, buttonIndex);

  // Click on the button
  await page.click(nextButton);
  console.log("Next Page Loading");
};
Mohammed Swillam
  • 9,119
  • 4
  • 36
  • 47