I have followed the answer to this question (Crawling with an authenticated session in Scrapy) to use Scrapy with an authenticated session. The problem is that the login seems to succeed, but when I then make a request, the session appears to be unauthenticated. Any idea where the problem is?
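For reference, the pattern from that answer, as I understand it, boils down to the following (trimmed down from memory; the placeholder names and URLs come from that answer, not from my project):

from scrapy.spiders.init import InitSpider
from scrapy.http import Request, FormRequest

class ExampleSpider(InitSpider):
    name = 'example'
    login_page = 'http://www.example.com/login'
    start_urls = ['http://www.example.com/useful_page/']

    def init_request(self):
        # called before crawling starts
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        # submit the login form
        return FormRequest.from_response(response,
                    formdata={'name': 'herman', 'password': 'password'},
                    callback=self.check_login_response)

    def check_login_response(self, response):
        # verify we are logged in, then hand control back to InitSpider
        if "Hi Herman" in response.body:
            self.log("Successfully logged in. Let's start crawling!")
            return self.initialized()
        else:
            self.log("Login failed")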
Here is my Python script:
import logging

import scrapy
from scrapy.spiders.init import InitSpider
from scrapy.utils.response import open_in_browser

# ShodanItem is defined in the project's items.py (the import is omitted here)


class LoginSpider(InitSpider):
    name = 'demo'
    login_page = ''   # login page
    inquery = ''      # search query
    start_urls = []   # urls with queries

    def init_request(self):
        return scrapy.Request(url=self.login_page, callback=self.login)

    def login(self, response):
        open_in_browser(response)
        return [scrapy.FormRequest.from_response(
            response,
            formid='login-form',
            formdata={'username': 'username', 'password': 'password'},
            callback=self.after_login)]

    def after_login(self, response):
        # check that the login succeeded before going on
        open_in_browser(response)
        if "invalid username or password" in response.body:
            self.log("Login failed", level=logging.ERROR)
            print "FAILED"
            return
        else:
            self.log('authentication succeed')
            return scrapy.Request(url=self.inquery, callback=self.parse_results)

    def parse_results(self, response):
        for result in response.xpath('//div[@class="span9"]/div[@class="search-result"]/div/a[@class="details"]/@href'):
            print 'new result'
            url = response.urljoin(result.extract())
            yield scrapy.Request(url, callback=self.parse_details_contents)

    def parse_details_contents(self, response):
        item = ShodanItem()
        for details in response.xpath('//ul[@class="services"]/li'):
            # the list field has to exist before assigning to it by index
            item['services_arr'] = [None] * 5
            item['ip'] = response.xpath('/html/body/div[3]/div/div[2]/div/div[1]/div/h2/text()').extract()
            # relative XPaths must not start with '/', or they are evaluated
            # against the document root instead of the current <li>
            item['services_arr'][0] = details.xpath('./div[1]/div[1]/text()').extract()
            item['services_arr'][1] = details.xpath('./div[1]/div[2]/text()').extract()
            item['services_arr'][2] = details.xpath('./div[1]/div[3]/text()').extract()
            item['services_arr'][3] = details.xpath('./div[2]/h3/text()').extract()
            item['services_arr'][4] = details.xpath('./div[2]/pre/text()').extract()
            print item['services_arr'][4]
            yield item
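In case it is relevant, I have not changed any of the cookie-related settings, so COOKIES_ENABLED should still be at its default of True. One thing I am considering is turning on cookie debugging in settings.py to check whether the session cookie from the login redirect is actually attached to the search request (this is just my own debugging idea, not something from the linked answer):

# settings.py -- sketch of the settings I would toggle for debugging
COOKIES_ENABLED = True   # default; the cookies middleware keeps the session
COOKIES_DEBUG = True     # log the Cookie / Set-Cookie headers of every request and response

# robots.txt shows up in the log below only because this is enabled;
# it should not affect authentication
ROBOTSTXT_OBEY = True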
Here is the log. I assume the login works, since the POST to the login page redirects to the main page, but afterwards, when I inspect the response with open_in_browser(), I get a page that asks me to authenticate in order to use the query:
2016-07-06 15:07:51 [scrapy] INFO: Spider opened
2016-07-06 15:07:51 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-07-06 15:07:51 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-07-06 15:07:52 [scrapy] DEBUG: Crawled (404) <GET https://account.shodan.io/robots.txt> (referer: None)
2016-07-06 15:07:52 [scrapy] DEBUG: Crawled (200) <GET https://account.shodan.io/login> (referer: None)
2016-07-06 15:07:52 [scrapy] DEBUG: Redirecting (302) to <GET https://www.shodan.io/?language=en> from <POST https://account.shodan.io/login>
2016-07-06 15:07:53 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/robots.txt> (referer: None)
2016-07-06 15:07:53 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/?language=en> (referer: https://account.shodan.io/login)
2016-07-06 15:07:53 [shodan.io] DEBUG: authentication succeed
2016-07-06 15:07:54 [scrapy] DEBUG: Crawled (200) <GET https://www.shodan.io/search?query=org%3A%22Instituto+Tecnol%C3%B3gico+y+de+Estudios+Superiores+de%22> (referer: https://www.shodan.io/?language=en)
2016-07-06 15:07:54 [scrapy] INFO: Closing spider (finished)
2016-07-06 15:07:54 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 2231,
'downloader/request_count': 6,
'downloader/request_method_count/GET': 5,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 11759,
'downloader/response_count': 6,
'downloader/response_status_count/200': 4,
'downloader/response_status_count/302': 1,
'downloader/response_status_count/404': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 7, 6, 20, 7, 54, 214825),
'log_count/DEBUG': 8,
'log_count/INFO': 7,
'request_depth_max': 2,
'response_received_count': 5,
'scheduler/dequeued': 4,
'scheduler/dequeued/memory': 4,
'scheduler/enqueued': 4,
'scheduler/enqueued/memory': 4,
'start_time': datetime.datetime(2016, 7, 6, 20, 7, 51, 797093)}
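In case it helps with diagnosing this, one thing I plan to try next is logging the Cookie header that is actually attached to the search request, to see whether the session cookie from the login redirect is being carried over (again, my own idea rather than something from the linked answer):

    def parse_results(self, response):
        # if the session is being kept, the cookies middleware should have
        # attached the session cookie to the request that produced this response
        self.log("Cookie header sent: %s" % response.request.headers.get('Cookie'))
        # ... rest of the parsing as above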