I am scraping this website: https://www.dccomics.com/comics
If you scroll all the way down, you will find a "Browse Comics" section with pagination.
I would like to scrape all 25 comics from each of pages 1 through 5.
This is the code I currently have:
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
import time
class Scraper():
    comics_url = "https://www.dccomics.com/comics"
    driver = webdriver.Chrome("C:\\laragon\\www\\Proftaak\\chromedriver.exe")
    # driver = webdriver.Chrome("C:\\laragon\\www\\proftaak-2020\\Proftaak-scraper\\chromedriver.exe")
    driver.get(comics_url)
    driver.implicitly_wait(500)  # note: implicitly_wait takes seconds, not milliseconds
    current_page = 2
    def GoToComic(self):
        # click through each of the 25 comics on the current page
        for i in range(1, 26):
            time.sleep(2)
            goToComic = self.driver.find_element_by_xpath(
                f'//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[2]/div[{i}]/a/img')
            self.driver.execute_script("arguments[0].click();", goToComic)
            self.ScrapeComic()
            self.driver.back()
            self.ClearFilter()
            if self.current_page != 6:
                if i == 25:
                    self.current_page += 1
                    self.ToNextPage()
    def ScrapeComic(self):
        self.driver.implicitly_wait(250)
        title = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[contains(@class, 'page-title')]")))]
        price = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[contains(@class, 'buy-container-price')]/span[contains(@class, 'price')]")))]
        available = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
            EC.visibility_of_all_elements_located((By.XPATH, "//div[contains(@class, 'sale-status-container')]/span[contains(@class, 'sale-status')]")))]
        try:
            description = [my_elem.text for my_elem in WebDriverWait(self.driver, 5).until(
                EC.visibility_of_all_elements_located((By.CLASS_NAME, "field-items")))]
        except TimeoutException:
            # some comics have no description; skip it instead of swallowing every error
            return
    def ToNextPage(self):
        if self.current_page != 6:
            nextPage = self.driver.find_element_by_xpath(
                f'//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[3]/div[1]/ul/li[{self.current_page}]/a')
            self.driver.execute_script("arguments[0].click();", nextPage)
            self.GoToComic()
    def AcceptCookies(self):
        self.driver.implicitly_wait(250)
        cookies = self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[4]/div[2]/div/button')
        self.driver.execute_script("arguments[0].click();", cookies)
        self.driver.implicitly_wait(100)

    def ClearFilter(self):
        self.driver.implicitly_wait(500)
        clear_filter = self.driver.find_element_by_class_name('clear-all-action')
        self.driver.execute_script("arguments[0].click();", clear_filter)

    def QuitDriver(self):
        self.driver.quit()
scraper = Scraper()
scraper.AcceptCookies()
scraper.ClearFilter()
scraper.GoToComic()
scraper.QuitDriver()
Now it scrapes the first 25 comics of the first page fine, but the problem arises on the second page: it scrapes the first comic of page 2 fine, but when I navigate back from that comic, the filter has been reset and the listing starts at page 1 again.
How could I make it either resume from the correct page, or make sure the filter is always cleared before going back to the comics page? I tried something with sessions/cookies, but it seems the filter is not persisted in any way.
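For reference, here is a workaround I have sketched but not tested: collect the 25 detail-page URLs from the listing first, then open each one directly with driver.get(), so driver.back() (and with it the filter reset) never comes into play. The link XPath and the li[page + 2] pagination offset are guesses carried over from my code above, and it assumes the next page's link is clickable right after reloading the listing. It would go inside the Scraper class:

    def ScrapeAllPages(self):
        for page in range(1, 6):
            # collect all 25 comic links on the current page before leaving it
            links = WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[2]/div/a')))
            urls = [link.get_attribute("href") for link in links]
            for url in urls:
                self.driver.get(url)  # straight to the detail page, no click/back
                self.ScrapeComic()
            if page < 5:
                # reload the listing (which resets to page 1) and jump ahead;
                # li[page + 2] mirrors the indexing my ToNextPage uses
                self.driver.get(self.comics_url)
                self.ClearFilter()
                next_link = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH,
                        f'//*[@id="dcbrowseapp"]/div/div/div/div[3]/div[3]/div[3]/div[1]/ul/li[{page + 2}]/a')))
                self.driver.execute_script("arguments[0].click();", next_link)

Is something along these lines reliable, or is there a cleaner way to keep the pagination state?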