Data are overwritten and they give the data of only last page how to solve these problem is any solution for these kindly recommend me I've seen several solutions to scrape multiple pages from a website, but couldn't make it work on my code
import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
class TestSpider(scrapy.Spider):
name = 'test'
url = 'https://advpalata.vrn.ru/registers/reestr_lawyers/'
for x in range(0,5):
payload='p='+str(x)+'&letterfilter=%D0%90'
headers = {
'authority': 'advpalata.vrn.ru',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'accept-language': 'en-US,en;q=0.9',
'cache-control': 'max-age=0',
'content-type': 'application/x-www-form-urlencoded',
'cookie': 'PHPSESSID=546743b283bb9e3b2e78dabbfb894220; ie=yes; stat_id=546743b283bb9e3b2e78dabbfb894220; _ym_uid=1658939896610936176; _ym_d=1658939896; _ym_isad=2; PHPSESSID=546743b283bb9e3b2e78dabbfb894220',
'origin': 'https://advpalata.vrn.ru',
'referer': 'https://advpalata.vrn.ru/registers/reestr_lawyers/',
'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-origin',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
def start_requests(self):
yield scrapy.FormRequest(
url=self.url,
method='POST',
body=self.payload,
headers=self.headers,
callback=self.parse_item,
)
def parse_item(self, response):
books = response.xpath("//td[@class='name']//a//@href").extract()
for book in books:
absolute_url = response.urljoin(book)
yield Request(absolute_url, callback=self.parse_book)
def parse_book(self, response):
title=response.css("h3::text").get()
# phone = response.xpath("//div[@class='advocate-right']//p['@umi:field-name=phone']//text()").get()
# email = response.xpath("//div[@class='advocate-right']//p['@umi:field-name=email']//a//text()").get()
yield{
'title':title,
# 'phone':phone,
# 'email':email
}