I've made a Scrapy project that contains multiple spiders in one file, and I need Scrapy to be able to distinguish which pipeline is associated with which spider, similar to the person who asked this SO question. Using the solution provided by the top-voted answer, I put the decorator in the pipelines file and then defined the pipeline lists within the spiders themselves. But when I run this, I get a NameError, because the pipeline classes are referenced in the spider file while being defined in pipelines.py.
As far as I can tell, pipelines.py isn't importable as a standalone module, so I can't simply import it into the spider file. I'm not sure whether the posted answer is still relevant, since it's not recent, but it seems to have worked for someone, so it's at least worth a try. By the way, I'm running my two spiders sequentially based on the code provided in the docs, and even though both spiders run when I use the command scrapy runspider, I don't think the pipeline classes are being called. However, when I run each spider individually, the tables are filled properly. I also included both pipeline classes in the ITEM_PIPELINES dictionary in settings.py. From this, I have a few questions:
1. Do I have the correct setup of both files, based on the answer that was provided in the question?
2. If so, how would I properly connect the namespaces of the two files? (I've sketched my best guess at the import below.)
3. Is there a better way to do this besides just creating separate projects?
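For question 2, my current guess is that pipelines.py is importable as part of the project package after all, so something like the following at the top of the spider file should resolve the NameError. The package name concert_comparator is taken from my existing items import; I haven't confirmed this is the right approach.

    # assumes the project package is concert_comparator, matching
    # the existing `from concert_comparator.items import ...` line
    from concert_comparator.pipelines import ComparatorPipeline, ComparatorPipeline2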
I have the code for both files below; any help would be appreciated, thanks.
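For reference, this is the relevant part of my settings.py; the priority numbers are arbitrary choices on my part, and the dotted paths assume the project package is concert_comparator.

    # settings.py -- both pipeline classes registered
    ITEM_PIPELINES = {
        'concert_comparator.pipelines.ComparatorPipeline': 100,
        'concert_comparator.pipelines.ComparatorPipeline2': 200,
    }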
pipelines.py
import functools

from scrapy import log
from sqlalchemy.orm import sessionmaker

from models import Tickets, Tickets3, db_connect, create_vs_tickets_table, create_tc_tickets_table


def check_spider_pipeline(process_item_method):
    """Only run process_item if this pipeline class appears in the
    spider's `pipeline` attribute; otherwise pass the item through."""
    @functools.wraps(process_item_method)
    def wrapper(self, item, spider):
        # message template for debugging
        msg = '%%s %s pipeline step' % (self.__class__.__name__,)

        # if this class is in the spider's pipeline, then use the
        # process_item method normally.
        if self.__class__ in spider.pipeline:
            spider.log(msg % 'executing', level=log.DEBUG)
            return process_item_method(self, item, spider)
        # otherwise, just return the untouched item (skip this step in the pipeline)
        else:
            spider.log(msg % 'skipping', level=log.DEBUG)
            return item
    return wrapper


class ComparatorPipeline(object):
    """Price comparison pipeline for storing scraped items in the database."""

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_vs_tickets_table(engine)
        self.Session = sessionmaker(bind=engine)

    @check_spider_pipeline
    def process_item(self, item, spider):
        """Save tickets in the database.

        This method is called for every item pipeline component.
        """
        if spider.name == "comparator":
            session = self.Session()
            ticket = Tickets(**item)
            try:
                session.add(ticket)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        return item


class ComparatorPipeline2(object):
    """Price comparison pipeline for storing scraped items in the database."""

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table.
        """
        engine = db_connect()
        create_tc_tickets_table(engine)
        self.Session = sessionmaker(bind=engine)

    @check_spider_pipeline
    def process_item(self, item, spider):
        """Save tickets in the database.

        This method is called for every item pipeline component.
        """
        if spider.name == "comparator3":
            session = self.Session()
            ticket = Tickets3(**item)
            try:
                session.add(ticket)
                session.commit()
            except:
                session.rollback()
                raise
            finally:
                session.close()
        return item
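models.py isn't shown; it's standard SQLAlchemy boilerplate along these lines (the engine URL and columns here are placeholders, not my exact schema):

    # models.py (sketch only -- the real column list matches the item fields)
    from sqlalchemy import create_engine, Column, Integer, String
    from sqlalchemy.ext.declarative import declarative_base

    Base = declarative_base()

    def db_connect():
        # placeholder URL; the real one points at my database
        return create_engine('sqlite:///tickets.db')

    def create_vs_tickets_table(engine):
        Base.metadata.create_all(engine)

    def create_tc_tickets_table(engine):
        Base.metadata.create_all(engine)

    class Tickets(Base):
        __tablename__ = 'vs_tickets'
        id = Column(Integer, primary_key=True)
        eventName = Column(String)
        ticketPrice = Column(String)
        # ...one column per remaining item field

    class Tickets3(Base):
        __tablename__ = 'tc_tickets'
        id = Column(Integer, primary_key=True)
        eventName = Column(String)
        ticketPrice = Column(String)
        # ...one column per remaining item field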
spider class definitions
import scrapy
import re
import json
from scrapy.crawler import CrawlerProcess
from scrapy import Request
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from concert_comparator.items import ComparatorItem, ComparatorItem3
from urlparse import urljoin
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
bandname = raw_input("Enter a bandname \n")
vs_url = "http://www.vividseats.com/concerts/" + bandname + "-tickets.html"
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
class MySpider(CrawlSpider):
    # this is the line that raises the NameError: ComparatorPipeline
    # lives in pipelines.py, not in this file
    pipeline = set([
        ComparatorPipeline
    ])
    handle_httpstatus_list = [416]
    name = 'comparator'
    allowed_domains = ["www.vividseats.com"]
    start_urls = [vs_url]
    tickets_list_xpath = './/*[@itemtype="http://schema.org/Event"]'
    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('tickets')
        price_list = [i.get('p') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()
    def parse_price(self, response):
        loader = response.meta['loader']
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "http://www.vividseats.com/javascript/tickets.shtml?productionId=" + json_id
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)
    def parse(self, response):
        """Iterate over the event listings and request each ticket page."""
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem(), selector=ticket)
            # define loader processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName', './/*[@class="productionsEvent"]/text()')
            loader.add_xpath('eventLocation', './/*[@class="productionsVenue"]/span[@itemprop="name"]/text()')
            loader.add_xpath('ticketsLink', './/*/a[@class="btn btn-primary"]/@href')
            loader.add_xpath('eventDate', './/*[@class="productionsDate"]/text()')
            loader.add_xpath('eventCity', './/*[@class="productionsVenue"]/span[@itemprop="address"]/span[@itemprop="addressLocality"]/text()')
            loader.add_xpath('eventState', './/*[@class="productionsVenue"]/span[@itemprop="address"]/span[@itemprop="addressRegion"]/text()')
            loader.add_xpath('eventTime', './/*[@class="productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            ticketsURL = "concerts/" + bandname + "-tickets/" + bandname + "-" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)
class MySpider3(CrawlSpider):
    pipeline = set([
        ComparatorPipeline2
    ])
    handle_httpstatus_list = [416]
    name = 'comparator3'
    allowed_domains = ["www.ticketcity.com"]
    start_urls = [tc_url]
    tickets_list_xpath = './/div[@class="vevent"]'
    def parse_json(self, response):
        loader = response.meta['loader']
        jsonresponse = json.loads(response.body_as_unicode())
        ticket_info = jsonresponse.get('B')
        price_list = [i.get('P') for i in ticket_info]
        if len(price_list) > 0:
            str_Price = str(price_list[0])
            ticketPrice = unicode(str_Price, "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        else:
            ticketPrice = unicode("sold out", "utf-8")
            loader.add_value('ticketPrice', ticketPrice)
        return loader.load_item()
    def parse_price(self, response):
        print "parse price function entered \n"
        loader = response.meta['loader']
        event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
        eventCity = ''.join(event_City)
        loader.add_value('eventCity', eventCity)
        event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
        eventState = ''.join(event_State)
        loader.add_value('eventState', eventState)
        event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
        eventDate = ''.join(event_Date)
        loader.add_value('eventDate', eventDate)
        ticketsLink = loader.get_output_value("ticketsLink")
        json_id_list = re.findall(r"(\d+)[^-]*$", ticketsLink)
        json_id = "".join(json_id_list)
        json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
        yield scrapy.Request(json_url, meta={'loader': loader}, callback=self.parse_json, dont_filter=True)
    def parse(self, response):
        """Iterate over the event listings and request each ticket page."""
        selector = HtmlXPathSelector(response)
        # iterate over tickets
        for ticket in selector.select(self.tickets_list_xpath):
            loader = XPathItemLoader(ComparatorItem3(), selector=ticket)
            # define loader processors
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            # iterate over fields and add xpaths to the loader
            loader.add_xpath('eventName', './/span[@class="summary listingEventName"]/text()')
            loader.add_xpath('eventLocation', './/div[@class="divVenue location"]/text()')
            loader.add_xpath('ticketsLink', './/a[@class="divEventDetails url"]/@href')
            #loader.add_xpath('eventDateTime', '//div[@id="divEventDate"]/@title') #datetime type
            #loader.add_xpath('eventTime', './/*[@class="productionsTime"]/text()')
            print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
            ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
            ticketsURL = urljoin(response.url, ticketsURL)
            yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback=self.parse_price, dont_filter=True)
configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    # run the two spiders sequentially, then stop the reactor
    yield runner.crawl(MySpider)
    yield runner.crawl(MySpider3)
    reactor.stop()

crawl()
reactor.run()
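One more thing I'm unsure about: when running from a script like this, I don't think a bare CrawlerRunner() picks up settings.py on its own, so ITEM_PIPELINES may never be loaded; if that's the real problem, passing the project settings explicitly might be the fix (untested guess):

    from scrapy.utils.project import get_project_settings

    # hand the project settings to the runner so ITEM_PIPELINES applies
    runner = CrawlerRunner(get_project_settings())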