I have written a Scrapy spider that I'm running inside a Django Celery task. When I start the worker with the command python manage.py celery worker --loglevel=info from this tutorial, the task runs in the terminal and the Scrapy log appears to start, but soon after the log begins to come up on the screen, the Celery output takes over the terminal window. I'm still new to Celery, so I can't tell what is happening to the task. Here is the code for the tasks.py script and the spider file (with code I got from an SO post).
tasks.py
from celery.registry import tasks
from celery.task import Task
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from django.core.mail import EmailMultiAlternatives
from ticket_city_scraper.ticket_city_scraper.spiders.tc_spider import spiderCrawl
from celery import shared_task
@shared_task
def crawl():
return spiderCrawl()
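For reference, this is how I understand the task gets queued — a minimal sketch, not my exact code; the dotted task name matches what the worker registers in the log below:
from comparison.tasks import crawl

result = crawl.delay()  # sends the task to the broker; the worker picks it up and returns an AsyncResult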
The spider file (with the relevant code at the bottom):
import scrapy
import re
import json
from scrapy.crawler import Crawler, CrawlerProcess  # Crawler is used by UrlCrawlerScript below
from scrapy import Request, signals  # signals.spider_closed is used to stop the reactor
from scrapy.contrib.spiders import CrawlSpider , Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import Join, MapCompose
from comparison.ticket_city_scraper.ticket_city_scraper.items import ComparatorItem
from urlparse import urljoin
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerRunner
from twisted.internet import reactor, defer
from scrapy.utils.log import configure_logging
from billiard import Process
bandname = raw_input("Enter bandname\n")
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
class MySpider3(CrawlSpider):
handle_httpstatus_list = [416]
name = 'comparator'
allowed_domains = ["www.ticketcity.com"]
start_urls = [tc_url]
tickets_list_xpath = './/div[@class = "vevent"]'
def create_link(self, bandname):
tc_url = "https://www.ticketcity.com/concerts/" + bandname + "-tickets.html"
self.start_urls = [tc_url]
#return tc_url
def parse_json(self, response):
loader = response.meta['loader']
jsonresponse = json.loads(response.body_as_unicode())
ticket_info = jsonresponse.get('B')
price_list = [i.get('P') for i in ticket_info]
if len(price_list) > 0:
str_Price = str(price_list[0])
ticketPrice = unicode(str_Price, "utf-8")
loader.add_value('ticketPrice', ticketPrice)
else:
ticketPrice = unicode("sold out", "utf-8")
loader.add_value('ticketPrice', ticketPrice)
return loader.load_item()
def parse_price(self, response):
print "parse price function entered \n"
loader = response.meta['loader']
event_City = response.xpath('.//span[@itemprop="addressLocality"]/text()').extract()
eventCity = ''.join(event_City)
loader.add_value('eventCity' , eventCity)
event_State = response.xpath('.//span[@itemprop="addressRegion"]/text()').extract()
eventState = ''.join(event_State)
loader.add_value('eventState' , eventState)
event_Date = response.xpath('.//span[@class="event_datetime"]/text()').extract()
eventDate = ''.join(event_Date)
loader.add_value('eventDate' , eventDate)
ticketsLink = loader.get_output_value("ticketsLink")
json_id_list= re.findall(r"(\d+)[^-]*$", ticketsLink)
json_id= "".join(json_id_list)
json_url = "https://www.ticketcity.com/Catalog/public/v1/events/" + json_id + "/ticketblocks?P=0,99999999&q=0&per_page=250&page=1&sort=p.asc&f.t=s&_=1436642392938"
yield scrapy.Request(json_url, meta={'loader': loader}, callback = self.parse_json, dont_filter = True)
def parse(self, response):
"""
# """
selector = HtmlXPathSelector(response)
# iterate over tickets
for ticket in selector.select(self.tickets_list_xpath):
loader = XPathItemLoader(ComparatorItem(), selector=ticket)
# define loader
loader.default_input_processor = MapCompose(unicode.strip)
loader.default_output_processor = Join()
# iterate over fields and add xpaths to the loader
loader.add_xpath('eventName' , './/span[@class="summary listingEventName"]/text()')
loader.add_xpath('eventLocation' , './/div[@class="divVenue location"]/text()')
loader.add_xpath('ticketsLink' , './/a[@class="divEventDetails url"]/@href')
#loader.add_xpath('eventDateTime' , '//div[@id="divEventDate"]/@title') #datetime type
#loader.add_xpath('eventTime' , './/*[@class = "productionsTime"]/text()')
print "Here is ticket link \n" + loader.get_output_value("ticketsLink")
#sel.xpath("//span[@id='PractitionerDetails1_Label4']/text()").extract()
ticketsURL = "https://www.ticketcity.com/" + loader.get_output_value("ticketsLink")
ticketsURL = urljoin(response.url, ticketsURL)
yield scrapy.Request(ticketsURL, meta={'loader': loader}, callback = self.parse_price, dont_filter = True)
# Code to run the spider from the celery task script. The crawl runs in a
# separate billiard Process because the Twisted reactor can only be started
# once per process and must not clash with the Celery worker itself.
class UrlCrawlerScript(Process):
    def __init__(self, spider):
        Process.__init__(self)
        settings = get_project_settings()
        self.crawler = Crawler(settings)
        self.crawler.configure()
        # stop the reactor when the spider closes
        self.crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
        self.spider = spider

    def run(self):
        self.crawler.crawl(self.spider)
        self.crawler.start()
        reactor.run()
def spiderCrawl():
# settings = get_project_settings()
# settings.set('USER_AGENT','Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)')
# process = CrawlerProcess(settings)
# process.crawl(MySpider3)
# process.start()
    spider = MySpider3()
crawler = UrlCrawlerScript(spider)
crawler.start()
crawler.join()
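To rule out Celery as the culprit, I can run the spider on its own using the CrawlerProcess variant from the commented-out block above — a minimal sketch, assuming it's pasted at the bottom of tc_spider.py:
# Sketch: run the spider standalone, e.g. `python tc_spider.py`
if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(MySpider3)
    process.start()  # blocks until the crawl finishes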
I'm trying to set things up so that the user can enter text into a form, which is then concatenated into a URL, but for now I'm using raw_input to get the user's input. Something along the lines of the sketch below is what I'm aiming for. Is there something that needs to be added to the code in order for the task to run to completion? Any help/code would be appreciated, thanks.
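This is a hypothetical sketch — the view, the 'bandname' form field, and the changed signature of spiderCrawl are all made up, and spiderCrawl would need to build the URL from its argument instead of calling raw_input at import time:
# tasks.py (sketch) -- the band name travels as a task argument
from celery import shared_task

@shared_task
def crawl(bandname):
    return spiderCrawl(bandname)  # spiderCrawl would build the URL from bandname

# views.py (sketch) -- hypothetical form handler
from django.http import HttpResponse
from comparison.tasks import crawl

def search(request):
    if request.method == 'POST':
        bandname = request.POST.get('bandname', '')
        crawl.delay(bandname)  # concatenation into the URL happens inside the spider
    return HttpResponse("crawl queued")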
EDIT:
Terminal window after running the command:
(trydjango18)elijah@elijah-VirtualBox:~/Desktop/trydjango18/src2/trydjango18$ python manage.py celery worker --loglevel=info
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/django/core/management/base.py:259: RemovedInDjango19Warning: "requires_model_validation" is deprecated in favor of "requires_system_checks".
RemovedInDjango19Warning)
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning:
The 'BROKER_VHOST' setting is scheduled for deprecation in version 2.5 and removal in version v4.0. Use the BROKER_URL setting instead
alternative='Use the {0.alt} instead'.format(opt))
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning:
The 'BROKER_HOST' setting is scheduled for deprecation in version 2.5 and removal in version v4.0. Use the BROKER_URL setting instead
alternative='Use the {0.alt} instead'.format(opt))
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning:
The 'BROKER_USER' setting is scheduled for deprecation in version 2.5 and removal in version v4.0. Use the BROKER_URL setting instead
alternative='Use the {0.alt} instead'.format(opt))
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning:
The 'BROKER_PASSWORD' setting is scheduled for deprecation in version 2.5 and removal in version v4.0. Use the BROKER_URL setting instead
alternative='Use the {0.alt} instead'.format(opt))
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/app/defaults.py:251: CPendingDeprecationWarning:
The 'BROKER_PORT' setting is scheduled for deprecation in version 2.5 and removal in version v4.0. Use the BROKER_URL setting instead
alternative='Use the {0.alt} instead'.format(opt))
/home/elijah/Desktop/trydjango18/src2/trydjango18/comparison/ticket_city_scraper/ticket_city_scraper/spiders/tc_spider.py:6: ScrapyDeprecationWarning: Module `scrapy.contrib.spiders` is deprecated, use `scrapy.spiders` instead
from scrapy.contrib.spiders import CrawlSpider , Rule
/home/elijah/Desktop/trydjango18/src2/trydjango18/comparison/ticket_city_scraper/ticket_city_scraper/spiders/tc_spider.py:9: ScrapyDeprecationWarning: Module `scrapy.contrib.loader` is deprecated, use `scrapy.loader` instead
from scrapy.contrib.loader import ItemLoader
/home/elijah/Desktop/trydjango18/src2/trydjango18/comparison/ticket_city_scraper/ticket_city_scraper/spiders/tc_spider.py:11: ScrapyDeprecationWarning: Module `scrapy.contrib.loader.processor` is deprecated, use `scrapy.loader.processors` instead
from scrapy.contrib.loader.processor import Join, MapCompose
Enter bandname
awolnation
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/apps/worker.py:161: CDeprecationWarning:
Starting from version 3.2 Celery will refuse to accept pickle by default.
The pickle serializer is a security concern as it may give attackers
the ability to execute any command. It's important to secure
your broker from unauthorized access when using pickle, so we think
that enabling pickle should require a deliberate action and not be
the default choice.
If you depend on pickle then you should set a setting to disable this
warning and to be sure that everything will continue working
when you upgrade to Celery 3.2::
CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
You must only enable the serializers that you will actually use.
warnings.warn(CDeprecationWarning(W_PICKLE_DEPRECATED))
[2015-08-05 18:15:22,915: WARNING/MainProcess] /home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/celery/apps/worker.py:161: CDeprecationWarning:
Starting from version 3.2 Celery will refuse to accept pickle by default.
The pickle serializer is a security concern as it may give attackers
the ability to execute any command. It's important to secure
your broker from unauthorized access when using pickle, so we think
that enabling pickle should require a deliberate action and not be
the default choice.
If you depend on pickle then you should set a setting to disable this
warning and to be sure that everything will continue working
when you upgrade to Celery 3.2::
CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
You must only enable the serializers that you will actually use.
warnings.warn(CDeprecationWarning(W_PICKLE_DEPRECATED))
-------------- celery@elijah-VirtualBox v3.1.18 (Cipater)
---- **** -----
--- * *** * -- Linux-3.13.0-54-generic-x86_64-with-Ubuntu-14.04-trusty
-- * - **** ---
- ** ---------- [config]
- ** ---------- .> app: default:0x7f6ce3b3e410 (djcelery.loaders.DjangoLoader)
- ** ---------- .> transport: amqp://guest:**@localhost:5672//
- ** ---------- .> results: database
- *** --- * --- .> concurrency: 2 (prefork)
-- ******* ----
--- ***** ----- [queues]
-------------- .> celery exchange=celery(direct) key=celery
[tasks]
. comparison.tasks.crawl
[2015-08-05 18:15:23,178: INFO/MainProcess] Connected to amqp://guest:**@127.0.0.1:5672//
[2015-08-05 18:15:23,276: INFO/MainProcess] mingle: searching for neighbors
[2015-08-05 18:15:24,322: INFO/MainProcess] mingle: all alone
/home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/djcelery/loaders.py:136: UserWarning: Using settings.DEBUG leads to a memory leak, never use this setting in production environments!
warn('Using settings.DEBUG leads to a memory leak, never '
[2015-08-05 18:15:24,403: WARNING/MainProcess] /home/elijah/Desktop/trydjango18/trydjango18/local/lib/python2.7/site-packages/djcelery/loaders.py:136: UserWarning: Using settings.DEBUG leads to a memory leak, never use this setting in production environments!
warn('Using settings.DEBUG leads to a memory leak, never '
[2015-08-05 18:15:24,404: WARNING/MainProcess] celery@elijah-VirtualBox ready.