I'm trying to adapt Scrapy's RetryMiddleware class, overriding the _retry
method with a copy-pasted version in which I just add one additional line. I tried starting my custom middleware module as follows:
import scrapy.downloadermiddlewares.retry
from scrapy.utils.python import global_object_name
However, this gives rise to an
ImportError: cannot import name global_object_name
According to the question "ImportError: Cannot import name X", this type of error is caused by circular imports, but in this case I cannot easily remove dependencies in Scrapy's source code. How can I fix this?
For the sake of completeness, here is the TorRetryMiddleware
I'm trying to implement:
import logging
import scrapy.downloadermiddlewares.retry
from scrapy.utils.python import global_object_name
import apkmirror_scraper.tor_controller as tor_controller
logger = logging.getLogger(__name__)
class TorRetryMiddleware(scrapy.downloadermiddlewares.retry.RetryMiddleware):
    """Retry middleware that requests a new Tor identity before every retry.

    The retry bookkeeping (retry counter, priority adjustment, stats and
    logging) is delegated to the parent ``RetryMiddleware._retry`` instead of
    being copy-pasted here.  As a result this class does not use Scrapy's
    ``global_object_name`` helper, so a module-level import of that name is
    not required by this class.
    """

    def __init__(self, settings):
        """Initialise the parent middleware, then override the retried codes.

        Args:
            settings: Settings object, forwarded unchanged to the parent
                constructor.
        """
        super(TorRetryMiddleware, self).__init__(settings)
        # Assigned after the parent constructor so it replaces whatever the
        # parent set up: retry on 403 ('Forbidden') and
        # 429 ('Too Many Requests').
        self.retry_http_codes = {403, 429}

    def _retry(self, request, reason, spider):
        """Build the retry request via the parent, then change the Tor identity.

        Args:
            request: The request that failed.
            reason: Why the request is being retried (an exception instance
                or a descriptive string), passed through to the parent.
            spider: The spider that issued the request.

        Returns:
            Whatever the parent ``_retry`` returns: the request to re-schedule,
            or ``None`` when the parent decides not to retry.
        """
        retryreq = super(TorRetryMiddleware, self)._retry(request, reason, spider)
        if retryreq is not None:
            # Only rotate the identity when a retry will actually be issued,
            # matching the branch in which the copied code made this call.
            tor_controller.change_identity()
        return retryreq