I have a script that scrapes data from a financial data website. It works perfectly when I run it locally or on a server inside a tmux session, but when I run it with Airflow I hit countless problems that only seem to arise when the script executes under Airflow.
Some of the errors I encounter are:
no such window: target window already closed
stale element reference: element is not attached to the page document
(the stale element error is sometimes resolvable by adding a sleep or timer)
among a few others. Is there a recommended way to handle long-running WebDriver sessions? As a temporary workaround I moved the WebDriver instantiation inside my iteration loop, so a fresh driver is created and quit for every page; that works fine when I run the script by hand, but it still fails intermittently under Airflow.
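For the stale element error specifically, what has occasionally worked is an explicit wait on the table rows rather than a bare sleep (a minimal sketch of what I mean by a "timer"; the XPath is the same one the job below uses):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the screener rows to be present,
# instead of sleeping for a fixed interval
rows = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//div[@id='screener-content']/table/tbody/tr[4]/td/table/tbody/tr")
    )
)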
Here is the code for the job I'm building:
from datetime import datetime
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
import redis
import os
import pandas as pd

r = redis.Redis(host='localhost', port=6379, db=0)
options = Options()
options.add_argument("--no-sandbox")
options.add_argument("--disable-setuid-sandbox")
options.add_argument("--remote-debugging-port=9222") # this
options.add_argument("--disable-dev-shm-using")
options.add_argument("--disable-extensions")
options.add_argument("--disable-gpu")
options.add_argument("--headless")
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
options.add_argument(f'user-agent={user_agent}')
fv_login_page = "https://finviz.com/screener.ashx"  # currently unused
executable_path = '/usr/local/bin/chromedriver'
def create_dataframe(pages=383, datadir_location="/home/arthur/datadir"):
    today_date = datetime.strftime(datetime.now(), '%Y-%m-%d')
    if not os.path.exists(datadir_location):
        os.mkdir(datadir_location)
    fundamentals_dir_location = f"{datadir_location}/fundamentals"
    if not os.path.exists(fundamentals_dir_location):
        os.mkdir(fundamentals_dir_location)
    newdir = f"{fundamentals_dir_location}/{today_date}"
    if not os.path.exists(newdir):
        os.mkdir(newdir)
    output_file = f"{newdir}/fundamentals.csv"
    symlink_name = f"{fundamentals_dir_location}/latest_fundamentals.csv"
    finviz_tickers = []  # currently unused
    depth = pages
    page_numbers = list(range(1, depth))  # 1..382; currently unused
    page_number_strs = ["Page {0}/{1}".format(pn, 382) for pn in page_numbers]  # currently unused
    xpage = "https://finviz.com/screener.ashx?v=111&r="
    table = []
    # A fresh driver is created (and quit) for every page as a workaround
    # for the session errors described above.
    for page in range(1, depth * 20 + 1, 20):
        driver = webdriver.Chrome(chrome_options=options, executable_path=executable_path)
        driver.implicitly_wait(1)
        driver.get(xpage + str(page))
        rows = driver.find_elements_by_xpath(
            "//div[@id='screener-content']/table/tbody/tr[4]/td/table/tbody/tr")
        for index, row in enumerate(rows):
            if index == 0:
                continue  # skip the header row that appears on every page
            tmp_row = []
            td_elements = row.find_elements_by_tag_name("td")
            if td_elements:
                for i in range(1, 11):
                    tmp_row.append(td_elements[i].text)
            print(tmp_row)
            table.append(tmp_row)
        driver.quit()
    col_names = open(f"{os.environ['AIRFLOW_HOME']}/jobs/nightwatch/seeds/ontology/finviz_columns.txt").readlines()
    col_names = [c.strip() for c in col_names]
    pd.DataFrame(table).to_csv(output_file, header=col_names, index=False)
    if os.path.exists(symlink_name):
        os.unlink(symlink_name)
    print("Updated list of tickers can be found at:\n")
    print("\t", output_file)
    os.symlink(output_file, symlink_name)

create_dataframe(pages=384)
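For context, Airflow runs the script as an ordinary Python process via a BashOperator, roughly like this (a simplified sketch, not my exact DAG; the dag_id, schedule, and script path are illustrative):

from datetime import datetime
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

# illustrative wiring of the job above into a DAG
dag = DAG(
    dag_id="finviz_fundamentals",
    schedule_interval="0 6 * * *",
    start_date=datetime(2021, 1, 1),
    catchup=False,
)

scrape_fundamentals = BashOperator(
    task_id="create_dataframe",
    bash_command="python ${AIRFLOW_HOME}/jobs/nightwatch/fundamentals.py",
    dag=dag,
)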
Any help or tips would be appreciated. I just find it odd that the script runs perfectly (on the same host, for that matter), yet when I schedule it with Airflow it runs for a while and then fails with some WebDriver error. Are there extra considerations I need to take into account?