I am learning to use Scrapy to collect job-posting information from LinkedIn. I believe I can now log in with Scrapy and reach the correct page, which contains the job links. However, when I try to use XPath to select the job links, it returns the wrong values. Can anyone help me out?
Here is my code:
import scrapy
from scrapy.http import Request, FormRequest
class LinkedinSpider2(scrapy.Spider):
    """Log in to LinkedIn, then fetch a job-search page and log its links.

    Request flow: ``start_requests`` -> ``login`` -> ``check_login_response``
    -> ``parse_item``.
    """

    name = "linkedin2"
    allowed_domains = ['linkedin.com']
    # Page holding the login form that ``login`` fills in and submits.
    login_page = 'https://www.linkedin.com/uas/login'
    # Job-search results page requested once the login has been verified.
    start_url = 'http://www.linkedin.com/jobs/search/?keywords=data%20analyst&location=United%20States&locationId=us%3A0'

    def start_requests(self):
        """Yield the initial request for the login page.

        This function is called before crawling starts.
        """
        self.log("start_request")
        yield Request(url=self.login_page, callback=self.login, dont_filter=True)

    def login(self, response):
        """Generate a login request from the form found in ``response``.

        Returns:
            A ``FormRequest`` built from the login page's form, with the
            credential fields filled in; its response is handled by
            ``check_login_response``.
        """
        return FormRequest.from_response(
            response,
            # NOTE(review): credentials are hard-coded placeholders; replace
            # them with real values (ideally read from settings/environment).
            formdata={'session_key': '***@gmail.com', 'session_password': 'password'},
            callback=self.check_login_response,
        )

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.

        Returns:
            A ``Request`` for ``start_url`` (handled by ``parse_item``) when
            the marker text "My Network" is present in the page, otherwise
            ``None`` so that no further request is scheduled.
        """
        # Use response.text (decoded str) rather than response.body (bytes):
        # on Python 3, ``"..." in <bytes>`` raises TypeError.
        if "My Network" in response.text:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            return Request(url=self.start_url, callback=self.parse_item)

        self.log("\n\n\nFailed, Bad times :(\n\n\n")
        return None

    def parse_item(self, response):
        """Log the page URL and, if it is the expected page, every link on it.

        The page is treated as the expected one when the marker text
        'Cognius' occurs in it; in that case the ``href`` attribute of every
        ``<a>`` element is extracted and logged.
        """
        self.log(response.url)
        # Same str-vs-bytes concern as in check_login_response: compare
        # against the decoded text, not the raw body.
        if 'Cognius' in response.text:
            self.log('***right page***')
            self.log(response.xpath("//a/@href").extract())
        else:
            self.log('***wrong page***')
Here is the output: enter image description here
Here is the source of the page: enter image description here