I've developed a Scrapy Splash scraper using Visual Studio Code on Windows 10.
When I run the scraper from the command line without a runner.py file, it works and writes the scraped content to "out.json":
scrapy crawl mytest -o out.json
However, when I run the scraper in debug mode in Visual Studio Code with the runner.py file below, it fails on the execute line (full code below):
Exception has occurred: ReactorNotRestartable
exception: no description
File "C:\scrapy\hw_spiders\spiders\runner.py", line 8, in <module>
execute(
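(For context, a minimal reproduction of the underlying Twisted behavior: the reactor can be started at most once per process, and a second run() raises exactly this exception.)
from twisted.internet import reactor

reactor.callWhenRunning(reactor.stop)  # stop right after startup
reactor.run()  # first start: fine, returns once the reactor stops
reactor.run()  # second start: raises error.ReactorNotRestartable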
I already checked here:
- Scrapy - Reactor not Restartable
- Scrapy raises ReactorNotRestartable when CrawlerProcess is ran twice
- ReactorNotRestartable error in while loop with scrapy
From those posts, it seems the error occurs when a second crawl is started (the advice being to "call crawl multiple times and start only once"); however, I don't see where I supposedly do that.
I also read there that while loops can interact badly with the Twisted reactor, but I don't have any of those in my code either.
So I'm now lost as to where I need to fix my code.
runner.py
# https://newbedev.com/debugging-scrapy-project-in-visual-studio-code
import os

from scrapy.cmdline import execute

os.chdir(os.path.dirname(os.path.realpath(__file__)))

try:
    execute(
        [
            'scrapy',
            'crawl',
            'mytest',
            '-o',
            'out.json',
        ]
    )
except SystemExit:
    pass
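(For reference, the pattern the linked posts recommend is a runner built on CrawlerProcess, which queues spiders and starts the reactor exactly once. A minimal sketch of what that could look like here, untested, and assuming the file sits next to scrapy.cfg so the project settings can be found:)
# runner_crawlerprocess.py -- sketch of an alternative runner (untested).
import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Assumes this file lives in the project root, next to scrapy.cfg.
os.chdir(os.path.dirname(os.path.realpath(__file__)))

settings = get_project_settings()
settings.set('FEEDS', {'out.json': {'format': 'json'}})  # equivalent of -o out.json

process = CrawlerProcess(settings)
process.crawl('mytest')  # queue the spider by name; may be called multiple times
process.start()          # start the Twisted reactor once; blocks until done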
launch.json
{
    "version": "0.1.0",
    "configurations": [
        {
            "name": "Python: Launch Scrapy Spider",
            "type": "python",
            "request": "launch",
            "module": "scrapy",
            "args": [
                "runspider",
                "${file}"
            ],
            "console": "integratedTerminal"
        }
    ]
}
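(A variant configuration that is sometimes suggested launches crawl with the spider name instead of runspider on the open file. This is an untested sketch; the cwd value assumes scrapy.cfg sits in the workspace root:)
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: Scrapy crawl mytest",
            "type": "python",
            "request": "launch",
            "module": "scrapy",
            "args": [
                "crawl",
                "mytest",
                "-o",
                "out.json"
            ],
            "cwd": "${workspaceFolder}",
            "console": "integratedTerminal"
        }
    ]
}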
settings.json
{
    "python.analysis.extraPaths": [
        "./hw_spiders"
    ]
}
middlewares.py
from scrapy import signals
from itemadapter import is_item, ItemAdapter


class MySpiderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        return None

    def process_spider_output(self, response, result, spider):
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        pass

    def process_start_requests(self, start_requests, spider):
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MyDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py
from itemadapter import ItemAdapter


class MyPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = 'hw_spiders'

SPIDER_MODULES = ['hw_spiders.spiders']
NEWSPIDER_MODULE = 'hw_spiders.spiders'

ROBOTSTXT_OBEY = True

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
    # 'hw_spiders.middlewares.MySpiderMiddleware': 543,
}

DOWNLOADER_MIDDLEWARES = {
    # 'hw_spiders.middlewares.MyDownloaderMiddleware': 543,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
}

SPLASH_URL = 'http://localhost:8050/'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
ROBOTSTXT_OBEY = False
mytest.py
import json
import re
import os
import scrapy
import time
from scrapy_splash import SplashRequest
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from ..myitems import RentalItem


class MyTest_Spider(scrapy.Spider):
    name = 'mytest'
    start_urls = ['<hidden>']

    def start_requests(self):
        yield SplashRequest(
            self.start_urls[0], self.parse
        )

    def parse(self, response):
        object_links = response.css('div.wrapper div.inner33 > a::attr(href)').getall()
        for link in object_links:
            yield scrapy.Request(link, self.parse_object)
        next_page = response.css('div.nav-links a.next.page-numbers::attr(href)').get()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse)

    def parse_object(self, response):
        item = RentalItem()
        item['url'] = response.url
        object_features = response.css('table.info tr')
        for feature in object_features:
            try:
                feature_title = feature.css('th::text').get().strip()
                feature_info = feature.css('td::text').get().strip()
            except AttributeError:  # .get() returned None for this row
                continue
        item['thumbnails'] = response.css("ul#objects li a img::attr(src)").getall()
        yield item
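(myitems.py is not shown above; a minimal sketch of what it presumably contains, with the field names inferred from the spider code:)
# myitems.py -- sketch only; the actual file may define more fields.
import scrapy


class RentalItem(scrapy.Item):
    url = scrapy.Field()
    thumbnails = scrapy.Field()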
UPDATE 1
I have now removed runner.py from my project and kept only the .vscode\launch.json shown above.
When I open mytest.py in Visual Studio Code and press F5 to debug, I see the following output:
Windows PowerShell
Copyright (C) Microsoft Corporation. All rights reserved.
Try the new cross-platform PowerShell https://aka.ms/pscore6
PS C:\scrapy\hw_spiders> & 'C:\Users\Adam\AppData\Local\Programs\Python\Python38-32\python.exe' 'c:\Users\Adam\.vscode\extensions\ms-python.python-2021.11.1422169775\pythonFiles\lib\python\debugpy\launcher' '51812' '--' '-m' 'scrapy' 'runspider' 'c:\scrapy\hw_spiders\spiders\mytest.py'
2021-11-19 14:19:02 [scrapy.utils.log] INFO: Scrapy 2.3.0 started (bot: hw_spiders)
2021-11-19 14:19:02 [scrapy.utils.log] INFO: Versions: lxml 4.5.2.0, libxml2 2.9.5, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.5 (tags/v3.8.5:580fbb0, Jul 20 2020,
15:43:08) [MSC v.1926 32 bit (Intel)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g 21 Apr 2020), cryptography 3.0, Platform Windows-10-10.0.19041-SP0
2021-11-19 14:19:02 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
Usage
=====
scrapy runspider [options] <spider_file>
runspider: error: Unable to load 'c:\\scrapy\\hw_spiders\\spiders\\mytest.py': attempted relative import with no known parent package
This must be the line "from ..myitems import RentalItem", but I don't know why it fails.
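(Note: scrapy runspider loads the .py file as a standalone script rather than as part of the hw_spiders package, which is why a relative import has no parent package to resolve against. One possible workaround, shown as an untested sketch and assuming the project root is on the import path, is an absolute import:)
# in mytest.py, instead of the relative import:
from hw_spiders.myitems import RentalItem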