I am new to python and looking to implement Luigi into some of my python data processing scripts. I have two tasks, one task will web scrape some data and create a csv. The next task (dependent on the 1st tasks csv file) will run a sql server proc to dump the csv data into the database. When I run these tasks individually they work fine. But when I add a requires it gives me the error you can see in the title.
Please can you let me know what I am doing wrong?
The Luigi full error is as follows:
Runtime error: Traceback (most recent call last): File "C:\Users\somepath\Luigi\venv\python\lib\site-packages\luigi\worker.py", line 182, in run raise RuntimeError('Unfulfilled %s at run time: %s' % (deps, ', '.join(missing))) RuntimeError: Unfulfilled dependency at run time: Generate_TV_WebScraping_File_X__DataMining_Lu_8213e479cf
Apologies for the code sample below in terms of indentations etc. The formatting has been changed on pasting.
My current code is below:
import requests
from bs4 import BeautifulSoup
import re
import pyodbc
import luigi
import arrow
class Generate_TV_WebScraping_File(luigi.ExternalTask):
input_path = luigi.Parameter('X:/somefilepath/Televised_Football_Staging.csv')
def ouptut(self):
return luigi.LocalTarget(self.input_path)
def run(self):
################################ GET DATA FROM WEBSITE ###############################################
## set url
page_link = 'https://www.somewebsite.html'
## request access with timout of 5 seconds
page_response = requests.get(page_link, timeout=5)
## BS to parse the html
page_content = BeautifulSoup(page_response.content, "html.parser")
## find all content related to match fixtures div class
div_team = page_content.findAll('div', attrs={"class":"span4 matchfixture"})
clean_date = ''
## set path and file for data export
f = open("X:\somefilepath\Televised_Football_Staging.csv", "w")
## for all the content in div class 'row-fluid'
for rows in page_content.findAll('div', attrs={"class":"row-fluid"}):
## if the content div class is match date
if rows.findAll('div', attrs={"class": "span12 matchdate"}):
## save it to the variable 'date_row'
date_row = rows.findAll('div', attrs={"class": "span12 matchdate"})
## clean it by removing html tags and comma separate
concat_rows = ",".join(str(x) for x in date_row)
clean_date = re.sub("<.*?>", " ", concat_rows)
## when it is not a match date in the div class 'row-fluid' and it is the match fixture content
elif rows.findAll('div', attrs={"class": "span4 matchfixture"}):
## clean it by removing html tags and comma separate
concat_rows = ",".join(str(x) for x in rows)
clean_rows = re.sub("<.*?>", " ", concat_rows)
## print the content and concatenate with date
f.write('%s\n' % (clean_rows + "," + clean_date))
## Close csv
f.close()
#######################################################################################################
class Insert_TV_WebScraping_To_Db(luigi.Task):
def requires(self):
return Generate_TV_WebScraping_File(self.input_path)
def ouptut(self):
sys_date = arrow.now().format('YYYYMMDD')
return luigi.LocalTarget('X:/somefilepath/tv_webscrape_log_' + sys_date + '.txt')
def run(self):
############################### INSERT DATA INTO DATABASE ###################################################
## set sql connection string to DataMiningDev
cnxn = pyodbc.connect(driver="{SQL Server}", server="someserver", database="somedatabase", autocommit=True)
## run sql query
cursor = cnxn.cursor()
cursor.execute('EXEC somedatabase.someschema.somedbproc')
## being kind
cnxn.close()
#############################################################################################################
# Run Luigi Tasks #
#luigi.run(main_task_cls=Generate_TV_WebScraping_File)
luigi.run(main_task_cls=Insert_TV_WebScraping_To_Db)