I have the following Scrapy CrawlSpider:
import logger as lg
from scrapy.crawler import CrawlerProcess
from scrapy.http import Response
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashTextResponse
from urllib.parse import urlencode
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
logger = lg.get_logger("oddsportal_spider")

class SeleniumScraper(CrawlSpider):
    name = "selenium"
    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "DOWNLOADER_MIDDLEWARES": {
            'scraper_scrapy.odds.middlewares.SeleniumMiddleware': 543,
        },
    }
    handle_httpstatus_list = [301]
    start_urls = ["https://www.oddsportal.com/tennis/results/"]
    rules = (
        Rule(
            LinkExtractor(allow="/atp-buenos-aires/results/"),
            callback="parse_tournament",
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow="/tennis/",
                restrict_xpaths="//td[@class='name table-participant']//a",
            ),
            callback="parse_match",
        ),
    )

    def parse_tournament(self, response: Response):
        logger.info(f"Parsing tournament - {response.url}")

    def parse_match(self, response: Response):
        logger.info(f"Parsing match - {response.url}")


process = CrawlerProcess()
process.crawl(SeleniumScraper)
process.start()
The Selenium middleware is as follows:
# Imports shown for completeness
from pathlib import Path

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signals.spider_opened)
        crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        logger.debug(f"Selenium processing request - {request.url}")
        self.driver.get(request.url)
        # Short-circuit Scrapy's downloader by returning the rendered page directly
        return HtmlResponse(
            request.url,
            body=self.driver.page_source,
            encoding='utf-8',
            request=request,
        )

    def spider_opened(self, spider):
        options = webdriver.FirefoxOptions()
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(
            options=options,
            executable_path=Path("/opt/geckodriver/geckodriver"),
        )

    def spider_closed(self, spider):
        self.driver.quit()  # quit() rather than close() so the browser process exits
End to end this takes around a minute for roughly 50 pages. To try to speed things up, and to take advantage of concurrent requests and JavaScript rendering, I've implemented the following scrapy_splash spider:
class SplashScraper(CrawlSpider):
    name = "splash"
    custom_settings = {
        "USER_AGENT": "*",
        "LOG_LEVEL": "WARNING",
        "SPLASH_URL": "http://localhost:8050",
        "DOWNLOADER_MIDDLEWARES": {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        "SPIDER_MIDDLEWARES": {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100},
        "DUPEFILTER_CLASS": 'scrapy_splash.SplashAwareDupeFilter',
        "HTTPCACHE_STORAGE": 'scrapy_splash.SplashAwareFSCacheStorage',
    }
    handle_httpstatus_list = [301]
    start_urls = ["https://www.oddsportal.com/tennis/results/"]
    rules = (
        Rule(
            LinkExtractor(allow="/atp-buenos-aires/results/"),
            callback="parse_tournament",
            process_request="use_splash",
            follow=True,
        ),
        Rule(
            LinkExtractor(
                allow="/tennis/",
                restrict_xpaths="//td[@class='name table-participant']//a",
            ),
            callback="parse_match",
            process_request="use_splash",
        ),
    )

    def process_links(self, links):
        # Note: not currently referenced by either Rule above
        for link in links:
            link.url = "http://localhost:8050/render.html?" + urlencode({'url': link.url})
        return links

    def _requests_to_follow(self, response):
        # Overridden from CrawlSpider so that SplashTextResponse is also accepted
        if not isinstance(response, (HtmlResponse, SplashTextResponse)):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                yield rule.process_request(request, response)

    def use_splash(self, request, response):
        request.meta.update(splash={'endpoint': 'render.html'})
        return request

    def parse_tournament(self, response: Response):
        logger.info(f"Parsing tournament - {response.url}")

    def parse_match(self, response: Response):
        logger.info(f"Parsing match - {response.url}")
However, this takes about the same amount of time, and I was hoping for a big increase in speed :( I've tried playing around with different DOWNLOAD_DELAY settings, but that hasn't made things any faster. All the concurrency settings are left at their defaults.
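
For reference, these are the settings I mean, at what I believe are Scrapy's documented defaults (nothing in my custom_settings overrides them):

# Scrapy's documented defaults - none of these are set in custom_settings above
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0   # 0 disables the per-IP limit
DOWNLOAD_DELAY = 0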
Any ideas on if/how I'm going wrong?