I have adapted the code from "Using Middleware to ignore duplicates in Scrapy".
from scrapy.exceptions import DropItem
from scrapy import log
import os.path


class IgnoreDuplicates():

    def __init__(self):
        # Load the URLs recorded by previous runs and keep the file open for appending
        self._cu_file = open("crawled_urls.txt", "a+")
        self._crawled_urls = set([line.strip() for line in self._cu_file.readlines()])

    def process_request(self, request, spider):
        if request.url in self._crawled_urls:
            raise DropItem("Duplicate product scrape caught by IgnoreDuplicates at <%s>" % request.url)
        else:
            # Remember the URL in memory and on disk so later runs skip it too
            self._crawled_urls.add(request.url)
            self._cu_file.write(request.url + '\n')
            log.msg("IgnoreDuplicates recorded this url " + request.url, level=log.DEBUG)
        return None
I have also registered the middleware in settings.py:
SPIDER_MANAGER_CLASS = 'slybot.spidermanager.SlybotSpiderManager'
EXTENSIONS = {'slybot.closespider.SlybotCloseSpider': 1}
ITEM_PIPELINES = {'slybot.dupefilter.DupeFilterPipeline': 1}
SPIDER_MIDDLEWARES = {'slybot.middleware.IgnoreDuplicates': 500, 'slybot.spiderlets.SpiderletsMiddleware': 999} # as close as possible to spider output
PLUGINS = ['slybot.plugins.scrapely_annotations.Annotations']
SLYDUPEFILTER_ENABLED = True
PROJECT_DIR = 'slybot-project'
FEED_EXPORTERS = {
    'csv': 'slybot.exporter.SlybotCSVItemExporter',
}
CSV_EXPORT_FIELDS = None
try:
    from local_slybot_settings import *
except ImportError:
    pass
The process_request method never gets called. I've tried changing the priority value for the middleware entry in settings.py so that it runs both before and after the SpiderletsMiddleware (roughly as sketched below), but neither the exception nor the log message shows up in the output.
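For reference, these are roughly the two orderings I tried; the exact priority numbers are just what I experimented with, not anything prescribed by Slybot:

# IgnoreDuplicates ordered before SpiderletsMiddleware (lower priority value)
SPIDER_MIDDLEWARES = {'slybot.middleware.IgnoreDuplicates': 500, 'slybot.spiderlets.SpiderletsMiddleware': 999}

# IgnoreDuplicates ordered after SpiderletsMiddleware (higher priority value)
SPIDER_MIDDLEWARES = {'slybot.spiderlets.SpiderletsMiddleware': 999, 'slybot.middleware.IgnoreDuplicates': 1000}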
How do I make sure the middleware is called?