I'm trying to use Python's Scrapy library with IBM Cloud Functions. I want to pass some arguments to my spider through `process.crawl`. How can I do that?
My code is as follows:
class MySpider(scrapy.Spider):
    """Spider that scrapes the header title from quotes.toscrape.com.

    Args:
        make: Optional spider argument supplied by the caller, e.g.
            ``process.crawl(MySpider, make="Audi")``. Defaults to ``None``.
    """

    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def __init__(self, make=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        # ``make`` is a named parameter, so it is not part of **kwargs and
        # is never forwarded to the base class; keep it on the instance so
        # callbacks such as parse() can read it.
        self.make = make
        init_url = "http://quotes.toscrape.com/"
        self.start_urls = [init_url]

    def parse(self, response):
        """Yield a single item holding the text of the header link."""
        title = response.css(".header-box > div a::text").extract_first()
        yield {"title": title}
# Run the spider in-process with a custom user agent.
process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
process.crawl(MySpider)  # <-------- Explanation
process.start()
Explanation
I found here that it can be done as follows:
process.crawl(MySpider, make="Audi")
But when I try to do that I get an error in my editor:
expected type 'dict' got 'str' instead
What am I doing wrong?
UPDATE
I use the scrapy spider for IBM cloud functions, thus my code is as follows:
import scrapy
from scrapy.crawler import CrawlerProcess
class MySpider(scrapy.Spider):
    """Spider that scrapes the header title from quotes.toscrape.com.

    Args:
        make: Optional spider argument supplied by the caller, e.g.
            ``process.crawl(MySpider, make="Audi")``. Defaults to ``None``.
    """

    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def __init__(self, make=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        # ``make`` is a named parameter, so it is not part of **kwargs and
        # is never forwarded to the base class; keep it on the instance so
        # it stays available after construction.
        self.make = make
        print("Make {}".format(make))

    def parse(self, response):
        """Yield a single item holding the text of the header link."""
        title = response.css(".header-box > div a::text").extract_first()
        yield {"title": title}
def main(params):
    """Entry point for the cloud function: run the spider, return a dict.

    Args:
        params: Invocation parameters passed by the caller (not read here).

    Returns:
        A fixed dictionary with a single ``"joke"`` key.
    """
    process = CrawlerProcess({'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    # In my editor I get a warning on the next line:
    # expected type 'dict' got 'str' instead
    process.crawl(MySpider, make="Audi")
    # NOTE(review): start() blocks while running the Twisted reactor. The
    # traceback in this post ends in ReactorNotRestartable, which suggests
    # main() was called after a reactor had already been run in the same
    # interpreter session -- confirm before relying on repeated calls.
    process.start()
    return {"joke": "Some shit joke"}
And when I run `main({})` from the console, I get the following error:
2018-06-22 08:42:45 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6024
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "./main.py", line 30, in main
  File "/Users/boris/Projects/IBM-cloud/virtualenv/lib/python3.6/site-packages/scrapy/crawler.py", line 291, in start
    reactor.run(installSignalHandlers=False)  # blocking call
  File "/Users/boris/Projects/IBM-cloud/virtualenv/lib/python3.6/site-packages/twisted/internet/base.py", line 1260, in run
    self.startRunning(installSignalHandlers=installSignalHandlers)
  File "/Users/boris/Projects/IBM-cloud/virtualenv/lib/python3.6/site-packages/twisted/internet/base.py", line 1240, in startRunning
    ReactorBase.startRunning(self)
  File "/Users/boris/Projects/IBM-cloud/virtualenv/lib/python3.6/site-packages/twisted/internet/base.py", line 748, in startRunning
    raise error.ReactorNotRestartable()
twisted.internet.error.ReactorNotRestartable