I want to scrape all the items' information from this page: https://allinone.pospal.cn/m#/categories (refer to the screenshot).
The content loads dynamically as you scroll the right-hand side of the page down to the bottom, and the page has two scrollbars. I have tried a few times but still can't get that inner pane to scroll, so I can only extract the first 20 items and can't reach the items loaded on the next scrolls, even though there are about 1,500+ items in total. I suspect the inner container itself has to be scrolled rather than the window; a rough sketch of what I mean is at the end of this post, after my code.
Please kindly help me with this.
My code is below:
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Load the webpage
url = 'https://allinone.pospal.cn/m#/categories'
driver = webdriver.Chrome()
driver.get(url)
# Wait for the promotion image to load and click it
promotion_image = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, '//img[@src="//imgw.pospal.cn/we/westroe/img/categories/discount.png"]'))
)
promotion_image.click()
# Get focus on the right-hand side of the page (there are two scrollbars; we want the right one)
# Wait for the items div to load, then focus on it
items_div = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div.yb-scrollable'))
)
#items_div.send_keys(Keys.NULL)
items_div.click()
# Do something with the div, e.g. get its text content
# print(items_div.text)
#### attempt 1 - scroll to the bottom of the page
# scroll_pause_time = 1
# scroll_step = 500
# scroll_height = driver.execute_script("return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight );")
# while True:
#     driver.execute_script(f"window.scrollTo(0, {scroll_height});")
#     scroll_height_new = driver.execute_script("return Math.max( document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight );")
#     if scroll_height_new == scroll_height:
#         break
#     scroll_height = scroll_height_new
#     time.sleep(scroll_pause_time)
#### attempt 2 - scroll to the bottom of the page
# # Get scroll height.
# last_height = driver.execute_script("return document.body.scrollHeight")
# while True:
#     # Scroll down to the bottom.
#     driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
#     # Wait for the page to load.
#     time.sleep(2)
#     # Calculate the new scroll height and compare it with the last scroll height.
#     new_height = driver.execute_script("return document.body.scrollHeight")
#     if new_height == last_height:
#         break
#     last_height = new_height
#### attempt 3
lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
match = False
while not match:
    lastCount = lenOfPage
    time.sleep(3)
    lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
    if lastCount == lenOfPage:
        match = True
# Extract the content of the yb-item tags
soup = BeautifulSoup(driver.page_source, 'html.parser')
yb_items = soup.find_all('div', {'class': 'yb-item'})
for yb_item in yb_items:
    print(yb_item.text.strip())
# Close the browser window
driver.quit()
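For reference, this is roughly the direction I think is needed but could not get working: driving the inner container's own scrollbar instead of the window's. Below is a minimal sketch of what I mean, assuming the item list really lives inside the div.yb-scrollable element located above and that its scrollHeight grows as new items are appended (I have not confirmed either assumption):

# Sketch only: scroll the inner list container itself, not the window.
# Assumes the same driver/imports as in the script above, and that
# div.yb-scrollable is the element that owns the second (right-hand) scrollbar.
scroll_box = driver.find_element(By.CSS_SELECTOR, 'div.yb-scrollable')
last_height = driver.execute_script("return arguments[0].scrollHeight;", scroll_box)
while True:
    # Jump the container's own scrollbar to its bottom
    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", scroll_box)
    time.sleep(2)  # give the next batch of items time to load
    new_height = driver.execute_script("return arguments[0].scrollHeight;", scroll_box)
    if new_height == last_height:
        break  # the container stopped growing, so assume all items are loaded
    last_height = new_height

If this is the right idea, I'd appreciate pointers on what I'm missing, e.g. whether a different container needs to be scrolled or whether I should wait on the item count instead of the height.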