I want to scrape all the products from this site which does not have a pagination button. The products load automatically when you scroll. My script can only scrape the first 40 products. I realized that the products load dynamically in a data-page attribute of the div tag ? ? I want my script to keep changing the data-page value and load the product but i dont know how to do it.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
url = 'https://www.positivepromotions.com/custom-blankets/c/navpp_1001_114/'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'}
result = requests.get(url, headers=headers, timeout=5000)
data = result.content.decode()
soup1 = BeautifulSoup(data,'lxml')
## get the category
## get the conbtainer first container
subcategory = soup1.find('h1').text.strip()
itemlist = []
for soup in soup1.find_all('div', class_='row cat-prod-list'):
for x in range(1,4):
#for pages in soup.find_all('div', id='categoryProducts', attrs={'data-page': True}):
for pages in soup.select('div[data-page]', id='categoryProducts'):
print(pages['data-page'])
for productList in pages.find_all('div', class_='col-sm-4 col-md-3 cat-prod-container'):
title = productList.find('a', class_='product-title').text.strip()
price = productList.find('span', class_='cat-price').text.strip().split('-',1)[0]
sku = productList.find('div', class_='grid-prod-sku').text.strip()
#productlist = soup.find_all('div', class_='prod-img-wrap')
links = productList.find('a', class_='cat-prod-img',href=True)['href']
image = productList.find('img')['data-src'].split('?',1)[0]
items = {
'Title': title,
'Price': price,
'Sku': sku,
'Category': subcategory,
'Link': links,
'Image': image
}
itemlist.append(items)
##print('Saving : ',title)
#time.sleep(1)
# print total products found
print(len(itemlist))
#df = pd.DataFrame(itemlist)
##print(df.head(5))
#df.to_csv(subcategory+'.csv')
###