The script times out at seemingly random points. It worked fine for a while, but all of a sudden I'm getting a timeout error every time I run it. The script scrapes a website by iteratively clicking the links for each date, grabbing the raw data behind each link, and concatenating it onto an initially empty DataFrame.
Here's the code:
import pandas as pd
from datetime import date, timedelta, datetime
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from io import StringIO
# Inclusive bounds of the date range to process, written as MM-DD-YYYY.
start_date = "04-26-2020"
final_date = "05-26-2020"
def get_dates(start_date, final_date):
    """Return every calendar day from start_date to final_date, inclusive.

    Args:
        start_date: First day of the range, as an 'MM-DD-YYYY' string.
        final_date: Last day of the range, as an 'MM-DD-YYYY' string.
            A non-string value (e.g. a datetime) is handed to pandas as is.

    Returns:
        A list of 'MM-DD-YYYY' strings, one per day, in ascending order.
        The list is empty when final_date is earlier than start_date.

    Raises:
        ValueError: If a string bound does not match 'MM-DD-YYYY'.
    """
    date_format = '%m-%d-%Y'
    start = datetime.strptime(start_date, date_format)
    # Parse the end bound with the same explicit format as the start bound
    # rather than leaving it to pandas' format inference, so both ends of
    # the range are always read month-first.
    if isinstance(final_date, str):
        end = datetime.strptime(final_date, date_format)
    else:
        end = final_date
    return pd.date_range(start, end, freq='D').strftime(date_format).tolist()
# Browser configuration used for the Firefox session opened by the scraper.
options = FirefoxOptions()
# NOTE(review): "--start-maximized" is a Chromium-style switch; confirm that
# Firefox actually honours it.
options.add_argument("--start-maximized")
# Pass the command-line flag directly instead of assigning
# `options.headless = True`. The `headless` setter only appends this same
# flag, and it is deprecated in Selenium 4; on a release without the setter
# the assignment would just create an unused attribute and Firefox would open
# a visible window. The former `assert options.headless` is dropped for the
# same reason: it could not detect that case and is stripped under `-O`.
options.add_argument("-headless")
def scrape(start_date, final_date, path_to_click):
    """Collect the CSV published for each day in a date range into one DataFrame.

    For every date returned by get_dates(), the function clicks the link
    whose title is '<date>.csv', clicks the element located by
    path_to_click, reads the text of the page's top-level <pre> element as
    CSV, and navigates back two pages to return to the listing.

    Relies on module-level names that are not defined in this function:
    `options` (Firefox options), `column_names` and `replace_columns`
    (expected to be defined elsewhere in this module).

    Args:
        start_date: First day to scrape, as an 'MM-DD-YYYY' string.
        final_date: Last day to scrape (inclusive), same format.
        path_to_click: XPath of the element to click after opening a date's
            link in order to reach the raw CSV text.

    Returns:
        A DataFrame with the rows of every daily CSV stacked in date order
        under a fresh 0..n-1 index.

    Raises:
        selenium.common.exceptions.TimeoutException: If an expected element
            does not become available within 10 seconds.
    """
    ff_driver = "/usr/bin/geckodriver"
    site = "url"
    driver = webdriver.Firefox(executable_path=ff_driver, options=options)
    try:
        driver.get(site)
        waiting = WebDriverWait(driver, 10)
        # Start from an empty frame carrying the expected columns so the
        # result has them even when the date range is empty.
        frames = [pd.DataFrame(columns=column_names)]
        for dt in get_dates(start_date, final_date):
            date_link = (By.XPATH, '//*[@title="{}.csv"]'.format(dt))
            waiting.until(EC.element_to_be_clickable(date_link)).click()
            waiting.until(
                EC.element_to_be_clickable((By.XPATH, path_to_click))
            ).click()
            # Wait for the raw-data page to render instead of reading it
            # immediately after the click; find_element(By.XPATH, ...) also
            # replaces the deprecated find_element_by_xpath helper.
            raw = waiting.until(
                EC.presence_of_element_located((By.XPATH, '/html/body/pre'))
            ).text
            # The first date used to be special-cased, but both branches did
            # exactly the same work, so every date takes a single path.
            daily = pd.read_csv(StringIO(raw))
            replace_columns(daily, column_names)
            daily['Last_Update'] = pd.to_datetime(daily['Last_Update'])
            frames.append(daily)
            # Two clicks were made to get here, so go back twice to return
            # to the page that lists the date links.
            driver.back()
            driver.back()
        # Concatenate once at the end rather than inside the loop, which
        # would copy the accumulated rows on every iteration.
        return pd.concat(frames, axis=0, ignore_index=True)
    finally:
        # Always shut the browser down, including after a timeout, so that
        # runs do not leave orphaned Firefox/geckodriver processes behind.
        driver.quit()