I have a Scrapy spider that scrapes a list of 3,000 URLs. Whether I crawl them slowly (0.8 pages per second) or a little faster (1.5 pages per second), the spider always starts failing after around 727 pages, returning HTTP 429 errors. Funnily enough, I shuffled the URLs and reran the spider 10 times: 3 of those runs scraped exactly 727 items, and the rest around 690-730, before the 429 errors started.
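This is how I confirm the failures are 429s and count them at the end of each run; it is just a small method added to the spider below, reading the status counters Scrapy's downloader stats keep by default:

def closed(self, reason):
    # Log the response status breakdown at shutdown, using the
    # counters Scrapy records automatically for every response.
    stats = self.crawler.stats
    ok = stats.get_value('downloader/response_status_count/200', 0)
    throttled = stats.get_value('downloader/response_status_count/429', 0)
    self.logger.info('Finished (%s): %d OK, %d throttled (429)', reason, ok, throttled)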
I am sending browser-like headers and using a random proxy and a random user agent for each request. Has anybody experienced something like this?
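I am not pasting the rotation middleware verbatim, but it is shaped roughly like the sketch below; PROXIES and USER_AGENTS are placeholder lists (my real proxies come from a provider whose middleware reads the proxy_country hint in request.meta):

import random

# Placeholder pools, not my real values
PROXIES = [
    'http://user:pass@proxy-1.example.com:8000',
    'http://user:pass@proxy-2.example.com:8000',
]
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ...',
]

class RandomProxyUserAgentMiddleware:
    def process_request(self, request, spider):
        # Pick a fresh proxy and user agent for every outgoing request
        request.meta['proxy'] = random.choice(PROXIES)
        request.headers['User-Agent'] = random.choice(USER_AGENTS)

It is enabled through DOWNLOADER_MIDDLEWARES in the project settings.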
Here is my code:
import scrapy
import json
import csv
import re
import random


class website(scrapy.Spider):
    name = "website"
    country_id = 'US'
    custom_settings = {
        'CONCURRENT_ITEMS': 40,
        'CONCURRENT_REQUESTS': 20,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 20,
        'DOWNLOAD_DELAY': 0.9,
        'FEED_EXPORT_ENCODING': 'utf-8',
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'LOG_LEVEL': 'INFO',
        'FEEDS': {
            f'data/{name}.jl': {
                "format": "jsonlines"
            }
        }
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Browser-like headers sent with every request
        self.headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1"
        }
        # The input file is a single comma-separated line of URLs,
        # so the one row read here holds the full list
        with open('data/missing_website_data_urls.txt') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            for row in csv_reader:
                self.missing_website_data_urls = row

    def start_requests(self):
        for search_url in self.missing_website_data_urls:
            # Keep the first 32 characters as-is and strip everything
            # except letters, digits and spaces from the remainder
            search_url = search_url[:32] + re.sub('[^0-9a-zA-Z ]+', '', search_url[32:])
            yield scrapy.Request(url=search_url,
                                 method='GET',
                                 headers=self.headers,
                                 callback=self.parse,
                                 cb_kwargs={'url': search_url},
                                 meta={'proxy_country': self.country_id})