I have a function, check_duplicates(), in my spider that checks whether a URL is already present in my database; if it is not, the URL is passed on to the parse_product method:
import mysql.connector
import scrapy
from scrapy import Request


def check_duplicates(url):
    # Returns the matching row if the URL is already stored, otherwise None.
    connection = mysql.connector.connect(
        host='host_ip',
        port=3306,
        user='username',
        password='pass',
        database='base_name',
    )
    cursor = connection.cursor()
    # Parameterized query instead of string formatting (avoids SQL injection).
    cursor.execute("SELECT url FROM my_table WHERE url = %s", (url,))
    result = cursor.fetchone()
    cursor.close()
    connection.close()
    return result
class CianSpider(scrapy.Spider):
    name = 'spider_name'

    def start_requests(self):
        url = 'https://some_site.ru'
        yield Request(
            url=url,
            method='GET',
            callback=self.parse,
        )

    def parse(self, response, **cb_kwargs):
        # Follow a product link only if its URL is not already in the database.
        for item in response.css('a[href*=".item"]::attr("href")').extract():
            url = response.urljoin(item)
            if check_duplicates(url) is None:
                yield scrapy.Request(
                    url=url,
                    cookies=self.cookies,
                    callback=self.parse_product,
                )

    def parse_product(self, response, **cb_kwargs):
        pass
How do I implement this mechanism using Scrapy spider middleware (how and where should I register the url verification function)?
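For context, here is a rough sketch of the direction I have in mind; I am not sure it is correct, which is why I am asking. The idea is a spider middleware whose process_spider_output filters the requests yielded by parse and drops any whose URL is already in the table. The class name and module path below are placeholders, and the connection parameters and table name are assumed to be the same as in the code above.

import mysql.connector
import scrapy


class UrlFilterSpiderMiddleware:
    # Spider middleware that drops outgoing Requests whose URL is already
    # stored in the database (class and module names are made up).

    def __init__(self):
        self.connection = mysql.connector.connect(
            host='host_ip',
            port=3306,
            user='username',
            password='pass',
            database='base_name',
        )

    def process_spider_output(self, response, result, spider):
        # `result` is everything the spider callback yielded for this response.
        for entry in result:
            if isinstance(entry, scrapy.Request) and self._seen(entry.url):
                continue  # skip requests whose URL is already in the table
            yield entry

    def _seen(self, url):
        cursor = self.connection.cursor()
        cursor.execute("SELECT url FROM my_table WHERE url = %s", (url,))
        found = cursor.fetchone() is not None
        cursor.close()
        return found

And, if that is the right approach, I assume it would be registered in settings.py like this (the module path 'myproject.middlewares' is a placeholder):

SPIDER_MIDDLEWARES = {
    'myproject.middlewares.UrlFilterSpiderMiddleware': 543,
}

Is this the intended way to do it, or does the check belong somewhere else (e.g. a downloader middleware or a dupefilter)?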