I am trying to build a crawler using Scrapy and Selenium WebDriver. The idea is to collect a set of URLs in parse() and pass them to a callback parse_url(), which in turn collects a different set of URLs and passes those to parse_data(). The first callback, to parse_url, works, but the second, to parse_data, gives an AssertionError. That is, if I run without parse_data it prints a list of URLs, but as soon as I include it I get the assertion error shown below.
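To make the intended flow concrete, here is a minimal sketch of the callback chain I am aiming for, written with plain Scrapy requests (one Request per URL; get_urls and get_data stand in for my own helpers, simplified stubs are shown after my real code):

import scrapy

class mySpiderSketch(scrapy.Spider):
    name = "mySpiderSketch"
    start_urls = ["http://www.example.com/url"]

    def parse(self, response):
        # step 1: collect the first set of urls from the start page
        for url in get_urls(response.body):
            yield scrapy.Request(url, callback=self.parse_url)

    def parse_url(self, response):
        # step 2: each of those pages gives a further set of urls
        for url in get_urls(response.body):
            yield scrapy.Request(url, callback=self.parse_data)

    def parse_data(self, response):
        # step 3: extract the actual data from the final pages
        return get_data(response.body)

The complication is that I need the Selenium driver to render each page, which is why my real code below passes the driver around by hand.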
What I actually have is something like this:
import scrapy
from selenium import webdriver

class mySpider(scrapy.Spider):
    name = "mySpider"
    allowed_domains = ["example.com"]
    start_urls = [
        "http://www.example.com/url",
    ]

    def parse(self, response):
        driver = webdriver.Firefox()
        driver.get(response.url)
        urls = get_urls(driver.page_source)  # get_urls returns a list
        yield scrapy.Request(urls, callback=self.parse_url(urls, driver))

    def parse_url(self, urls, driver):
        url_list = []
        for i in urls:
            driver.get(i)
            url_list.append(get_urls(driver.page_source))  # gets some more urls
        yield scrapy.Request(urls, callback=self.parse_data(url_list, driver))

    def parse_data(self, url_list, driver):
        data = get_data(driver.page_source)
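For completeness, get_urls and get_data are my own helpers; simplified stand-ins would look roughly like this (my real versions use more involved selectors):

from scrapy.selector import Selector

def get_urls(page_source):
    # pull all link hrefs out of the raw page source
    return Selector(text=page_source).xpath('//a/@href').extract()

def get_data(page_source):
    # placeholder extraction; the real version grabs several fields
    return Selector(text=page_source).xpath('//title/text()').extract()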
This is the traceback:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/core/spidermw.py", line 48, in process_spider_input
return scrape_func(response, request, spider)
File "/usr/local/lib/python2.7/dist-packages/scrapy/core/scraper.py", line 145, in call_spider
dfd.addCallbacks(request.callback or spider.parse, request.errback)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 299, in addCallbacks
assert callable(callback)
AssertionError
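Looking at the last two lines, Twisted's addCallbacks asserts that whatever it receives as the callback is callable. I suspect that writing callback=self.parse_url(urls, driver) calls the method immediately and hands its return value to Scrapy, rather than handing over the method itself. A minimal reproduction of the same assertion outside Scrapy (handler is just a placeholder function):

from twisted.internet.defer import Deferred

def handler(result):
    # placeholder callback that just passes the result through
    return result

d = Deferred()
d.addCallbacks(handler, handler)        # fine: handler is callable
d.addCallbacks(handler(None), handler)  # AssertionError: handler(None) is None, not a callable

If that is the cause, what is the correct way to pass the extra arguments (the url list and the driver) on to my callbacks?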