I am trying to build a scraper for The Verge archives to collect news headlines; my primary aim is to scrape data for a given month and year. The code was working a few days ago and scrolled properly, but now it fails to scroll and gets stuck every time. I am trying to scroll with an ActionChains CTRL+END key combination, but it does not work. I have tried other ways too, with no luck.
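For example, one of the other ways I tried was sending the keystroke to the <body> element directly instead of going through ActionChains, in case the synthesized key events were landing on the wrong element (just a sketch of that variant, using the same driver object as below):

    body = driver.find_element(By.TAG_NAME, 'body')
    # Keys.CONTROL + Keys.END presses Ctrl and End together; Selenium
    # releases the modifier at the end of the send_keys call.
    body.send_keys(Keys.CONTROL + Keys.END)

That did not help either. The scrolling function is: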
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# parse_dates, title_extractor, date_extractor, link_extractor and logger
# are defined elsewhere in my project.

def scrolling_func(wait, driver):
    print("It is trying to scroll")
    SCROLL_PAUSE_TIME = 5
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Keys.END (not the literal string 'END') sends the End key.
        ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.END).key_up(Keys.CONTROL).perform()
        # Wait for the "Load More" button instead of grabbing it immediately.
        load_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.p-button')))
        # driver.execute_script("arguments[0].scrollIntoView();", load_button)
        # ActionChains(driver).move_to_element(load_button).click().perform()
        load_button.click()
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        # driver.delete_all_cookies()
        time.sleep(1)
    # Jump back to the top once the page has stopped growing.
    ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.HOME).key_up(Keys.CONTROL).perform()
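As far as I understand, CTRL+END only scrolls if the page body actually has keyboard focus, so as a fallback I also tried driving the scroll with JavaScript instead of keystrokes. A minimal sketch of that variant (it assumes, like scrolling_func above, that the page keeps growing while the Load More button is being clicked):

def scroll_with_js(driver, pause=5, max_rounds=50):
    # Fallback sketch: scroll via JavaScript, which does not depend on
    # which element currently has keyboard focus.
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # the page stopped growing
        last_height = new_height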
And the scraper is:
def scraper(years, months):
    PATH = r"C:\Users\astar\Stock market tutorials\chromedriver_win64\chromedriver.exe"
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--ignore-ssl-errors')
    # Selenium 4 passes the driver path through a Service object.
    driver = webdriver.Chrome(service=Service(PATH), options=options)
    driver.maximize_window()
    urls = parse_dates(years, months)
    final_headlines = []
    final_dates = []
    final_links = []
    for url in urls:
        driver.get(url)
        done = True
        while done:
            try:
                wait = WebDriverWait(driver, 10)
                scrolling_func(wait, driver)
            except Exception:
                # The wait timed out, i.e. there is no "Load More"
                # button left, so the page is fully expanded.
                done = False
        ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.HOME).key_up(Keys.CONTROL).perform()
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class
        # https://stackoverflow.com/questions/42732958/python-parallel-execution-with-selenium
        # https://stackoverflow.com/questions/44245451/how-to-scrape-multiple-html-page-in-parallel-with-beautifulsoup-in-python
        # https://stackoverflow.com/questions/45816619/selenium-firefox-webdriver-for-python-keyerror-value
        # The heading has the form "... for <month year> (<article count>)".
        num_articles = soup.find("h1", class_="p-page-title").text
        current = num_articles[num_articles.find("for") + 4:num_articles.find("(")]
        articles_num = num_articles[num_articles.find("(") + 1:-1]
        titles = soup.find_all("h2", class_="c-entry-box--compact__title")
        dates = soup.find_all("time", class_="c-byline__item")
        # articles_num is a string, so cast it before comparing with a count.
        if int(articles_num) != len(titles):
            logger.warning("Actual #articles {} and #scraped articles {} for {}".format(articles_num, len(titles), current))
        print(len(titles), len(dates))
        headlines = list(map(title_extractor, titles))
        dates = list(map(date_extractor, dates))
        links = list(map(link_extractor, titles))
        final_headlines.extend(headlines)
        final_dates.extend(dates)
        final_links.extend(links)
        time.sleep(15)
    print(len(final_headlines), len(final_dates), len(final_links))
    assert len(final_headlines) == len(final_dates) == len(final_links), \
        f'Different lengths of headlines {len(final_headlines)} and dates {len(final_dates)}'
    data = {"Headlines": final_headlines, "Dates": final_dates, "Links": final_links}
    df = pd.DataFrame(data)
    df.to_csv('file1.csv')
    driver.quit()
    return df
if __name__ == "__main__":
    scraper(["2021"], ["3"])
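One more variant I experimented with skips keyboard scrolling entirely and just clicks the Load More button until the explicit wait times out (a sketch only; it reuses the .p-button selector from above and assumes a timeout means every article has been loaded):

from selenium.common.exceptions import TimeoutException

def click_load_more_until_done(driver, timeout=10):
    wait = WebDriverWait(driver, timeout)
    while True:
        try:
            button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '.p-button')))
        except TimeoutException:
            break  # no clickable "Load More" button left
        # Scroll the button into view and click it through JavaScript,
        # which sidesteps "element not interactable" errors.
        driver.execute_script("arguments[0].scrollIntoView();", button)
        driver.execute_script("arguments[0].click();", button)
        time.sleep(2)  # give the next batch of articles time to render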
As I said, it is unable to scroll; it was working fine a few days ago, but now it breaks every time. Earlier I also had an issue where the page would get stuck before the full list of articles had loaded. Can anyone help me here? Thanks in advance.