I'm making a couple of scrapers, and now I'm trying to write a script that runs the corresponding spiders with URLs collected from a database, but I can't find a way to do this.
I have this in my spider:
class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'
    url = ''
    DEBUG = False

    def start_requests(self):
        if self.url:
            yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        # Get product name
        try:
            self.p_name = response.xpath('//*[@id="product-info"]/h2[1]/a/text()').get()
        except Exception:
            print(f'{CERROR} Problem while getting product name from website - {self.name}')
        # Get product price
        try:
            self.price_no_cent = response.xpath('//*[@id="price-container"]/div/span[2]/text()').get()
            self.cent = response.xpath('//*[@id="price-container"]/div/span[2]/span[1]/text()').get()
            self.currency = response.xpath('//*[@id="price-container"]/div/span[2]/span[2]/text()').get()
            if self.currency is None:
                self.currency = response.xpath('//*[@id="price-container"]/div/span[2]/span[1]/text()').get()
                self.cent = None
        except Exception:
            print(f'{CERROR} Problem while getting product price from website - {self.name}')
        # Join self.price_no_cent with self.cent
        try:
            if self.cent is not None:
                self.price = str(self.price_no_cent) + str(self.cent)
                self.price = self.price.replace(',', '.')
            else:
                self.price = self.price_no_cent
        except Exception:
            print(f'{CERROR} Problem while joining price with cents - {self.name}')
        # Return data
        if self.DEBUG:
            print([self.p_name, self.price, self.currency])
        data_collected = ShopScrapersItems()
        data_collected['url'] = response.url
        data_collected['p_name'] = self.p_name
        data_collected['price'] = self.price
        data_collected['currency'] = self.currency
        yield data_collected
Normally when I run the spider from the console I do:
scrapy crawl ElCorteIngles -a url='https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/'
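As far as I understand it, the -a flag just passes keyword arguments to the spider's __init__, and the base Spider class sets them as attributes, which is why self.url ends up filled in. Written out explicitly, that mechanism would look roughly like this sketch (the **kwargs pass-through to super().__init__() is the important part):

import scrapy

class ElCorteIngles(scrapy.Spider):
    name = 'ElCorteIngles'

    def __init__(self, url='', **kwargs):
        # Keyword arguments from -a (or from crawl()) arrive here
        super().__init__(**kwargs)
        self.url = url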
Now I need a way to do the same from an external script and also to capture the items the spider yields as data_collected.
What I currently have in my external script is this:
import scrapy
from scrapy.crawler import CrawlerProcess
import sqlalchemy as db
# Import internal libraries
from Ruby.Ruby.spiders import *

# Variables
engine = db.create_engine('mysql+pymysql://DATABASE_INFO')


class Worker(object):

    def __init__(self):
        self.crawler = CrawlerProcess({})

    def scrape_new_links(self):
        conn = engine.connect()
        # Get all new links from DB and scrape them
        query = 'SELECT * FROM Ruby.New_links'
        result = conn.execute(query)
        for x in result:
            telegram_id = x[1]
            email = x[2]
            phone_number = x[3]
            url = x[4]
            spider = x[5]
            # In this case the spider will be ElCorteIngles and the url
            # https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/
            self.crawler.crawl(spider, url=url)
        self.crawler.start()


Worker().scrape_new_links()
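From what I've read in the Scrapy docs, two things seem relevant here: the Twisted reactor can only be started once per process, so all the crawl() calls should be queued before a single start(); and the yielded items can apparently be captured in the calling script through the item_scraped signal. A minimal sketch of what I have in mind (collect_items and the items list are my own names, and the spider's module path here is a guess):

from scrapy import signals
from scrapy.crawler import CrawlerProcess
from Ruby.Ruby.spiders.el_corte_ingles import ElCorteIngles  # hypothetical module path

items = []  # filled by the signal handler below

def collect_items(item, response, spider):
    items.append(item)

process = CrawlerProcess({})
crawler = process.create_crawler(ElCorteIngles)
crawler.signals.connect(collect_items, signal=signals.item_scraped)
process.crawl(crawler, url='https://www.elcorteingles.pt/electrodomesticos/A26601428-depiladora-braun-senso-smart-5-5500/')
process.start()  # blocks until every queued crawl has finished
print(items)

One thing I'm not sure about: passing the spider name as a string (like I do with x[5]) apparently only works if the settings include SPIDER_MODULES so Scrapy's spider loader can resolve the name, which my empty CrawlerProcess({}) doesn't have.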
I also don't know if passing url=url to self.crawler.crawl() is the proper way to give the URL to the spider, but let me know what you think.
All data from the yield is being returned by a pipeline.
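In case it matters, the pipeline is the standard kind, along these lines (a simplified sketch, not my exact code):

class ShopScrapersPipeline:
    def process_item(self, item, spider):
        # every item the spider yields passes through here
        return item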
I think there's no need for extra info, but if you need any, just let me know!