I'm fairly new to web scraping and am just testing it out on a few web pages. I've successfully scraped several Amazon searches; however, in this case I get a 301 redirect, which causes a different page to be scraped.
I've tried adding handle_httpstatus_list = [301] to the spider to stop the redirect from being followed, but with that line no data is scraped at all.
Reading the Scrapy documentation, I thought editing the middlewares might solve this, but I'm still unsure how to go about it; I've put a rough sketch of what I mean after my spider code below.
import scrapy


class BooksSpider(scrapy.Spider):
    name = 'books'
    # My attempt at stopping the 301 redirect from being followed
    handle_httpstatus_list = [301]
    start_urls = ['https://www.amazon.com/s?i=stripbooks&rh=n%3A2%2Cp_30%3AIndependently+published%2Cp_n_feature_browse-bin%3A2656022011&s=daterank&Adv-Srch-Books-Submit.x=50&Adv-Srch-Books-Submit.y=10&field-datemod=8&field-dateop=During&field-dateyear=2019&unfiltered=1&ref=sr_adv_b']

    def parse(self, response):
        # Each search result is an .s-result-item
        SET_SELECTOR = '.s-result-item'
        for car in response.css(SET_SELECTOR):
            NAME = '.a-size-medium ::text'
            TITLE = './/h2/a/span/text()'
            LINK = './/h2/a/@href'
            yield {
                'name': car.css(NAME).extract(),
                'title': car.xpath(TITLE).extract(),
                'link': car.xpath(LINK).get(),
            }

        # Follow the pagination link, if there is one
        NEXT_PAGE_SELECTOR = '.a-last a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
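For the middleware idea, this is the kind of thing I was picturing, based on the REDIRECT_ENABLED setting and custom_settings in the docs. It's only a sketch and I'm not sure it's the right approach; the spider name and the shortened start URL are just placeholders:

import scrapy


class NoRedirectBooksSpider(scrapy.Spider):
    """Sketch: stop the RedirectMiddleware from following the 301,
    then inspect the response to see where Amazon is redirecting to."""
    name = 'books_noredirect'
    # Same Amazon search URL as in the spider above (shortened here)
    start_urls = ['https://www.amazon.com/s?i=stripbooks&ref=sr_adv_b']

    # Per-spider settings: disable the built-in RedirectMiddleware
    custom_settings = {
        'REDIRECT_ENABLED': False,
    }
    # Still needed so the 301 response isn't dropped by HttpErrorMiddleware
    handle_httpstatus_list = [301]

    def parse(self, response):
        if response.status == 301:
            # Log where Amazon wants to send us instead of scraping anything
            self.logger.info('301 redirect to: %s',
                             response.headers.get('Location'))
            return
        # ...normal extraction from the search results page would go here...

My thinking is that logging the Location header would at least show which page the search is being redirected to, but I don't know whether disabling the middleware like this is the correct way to deal with the redirect, or whether a custom downloader middleware is needed instead.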