I've built a script that uses selenium, it works well, however the site I am scraping infinitely loads, and so built in something to manage this.
However every-time it scrolls down it re scrapes the data it scraped before!
How can I change the script to it only scrapes data that hasn't been scraped yet?
I have seen some questions similar to this, and added the some code based on them, however I think my case is slightly different!
Thanks!
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
import time
import os
import csv
browser = webdriver.Chrome(executable_path="/chromedriver")
browser.get("***url***")
filename ="fileName.csv"
f = open(filename, 'w')
headers ="Title, Date, Time\n "
f.write(headers)
browser.find_element_by_css_selector('').click()
time.sleep(3)
page = browser.find_elements_by_class_name('')
# Get scroll height
last_height = browser.execute_script("return document.body.scrollHeight")
t_end = time.time() + 60
while time.time() < t_end:
try:
for items in page:
title = items.find_element_by_class_name('').text.replace(',', '|')
date = items.find_element_by_class_name('').text
print('Name:',title)
print('Date:',date)
print("")
f.write(title + "," + date.split(" ")[0] + "," + date.split(" ")[1] + "\n")
# Scroll down to bottom
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(5)
page = browser.find_elements_by_class_name('')
except:
break
f.close()
browser.quit()