I am trying to scrape the website of lacentrale.fr thanks to scrapy, but even if I rotate my users agent and IP address (thanks to TOR), the web site detect my robot and send me false values. Please can you check my code used in middlwares and setting and tell me if something went wrong.
code in middlewares :
from tutorial.settings import * #USER_AGENT_LIST
import random
from stem.control import Controller
from toripchanger import TorIpChanger
from stem import Signal
class RandomUserAgentMiddleware(object):
def process_request(self, request, spider):
ua = random.choice(USER_AGENT_LIST)
if ua:
request.headers.setdefault('User-Agent', ua)
def _set_new_ip():
with Controller.from_port(port=9051) as controller:
controller.authenticate(password='')
controller.signal(Signal.NEWNYM)
ip_changer = TorIpChanger(reuse_threshold=10)
class ProxyMiddleware(object):
_requests_count = 0
def process_request(self, request, spider):
self._requests_count += 1
if self._requests_count > 10:
self._requests_count = 0
ip_changer.get_new_ip()
print("New Tor connection processed")
request.meta['proxy'] = 'http://127.0.0.1:8118'
spider.log('Proxy : %s' % request.meta['proxy'])
Code used in settings :
BOT_NAME = 'tutorial'
SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True
DOWNLOADER_MIDDLEWARES = {
'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
'tutorial.middlewares.RandomUserAgentMiddleware': 400,
'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware':None, # to avoid the raise IOError, 'Not a gzipped file' exceptions.IOError: Not a gzipped file
'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
'tutorial.middlewares.ProxyMiddleware': 100
}
USER_AGENT_LIST=[
{'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36'},
{
'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'},
{'User-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:35.0) Gecko/20100101 Firefox/35.0'},
{'User-agent': 'Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0'}
]
EDIT II
it's seems that tor use the same ip each time and there is not rotation on the ip address. I don't know what I can change in my middlwares file to resolve this !! please any idea ?