I was trying to scrap link which has ajax call for pagination. I am trying to crawl http://www.demo.com link. and in .py file I provided this code for restrict XPATH and coding is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import sumSpider, Rule
from scrapy.selector import HtmlXPathSelector
from sum.items import sumItem
class Sumspider1(sumSpider):
name = 'sumDetailsUrls'
allowed_domains = ['sum.com']
start_urls = ['http://www.demo.com']
rules = (
Rule(LinkExtractor(restrict_xpaths='.//ul[@id="pager"]/li[8]/a'), callback='parse_start_url', follow=True),
)
#use parse_start_url if your spider wants to crawl from first page , so overriding
def parse_start_url(self, response):
print '********************************************1**********************************************'
#//div[@class="showMoreCars hide"]/a
#.//ul[@id="pager"]/li[8]/a/@href
self.log('Inside - parse_item %s' % response.url)
hxs = HtmlXPathSelector(response)
item = sumItem()
item['page'] = response.url
title = hxs.xpath('.//h1[@class="page-heading"]/text()').extract()
print '********************************************title**********************************************',title
urls = hxs.xpath('.//a[@id="linkToDetails"]/@href').extract()
print '**********************************************2***url*****************************************',urls
finalurls = []
for url in urls:
print '---------url-------',url
finalurls.append(url)
item['urls'] = finalurls
return item
My items.py file contains
from scrapy.item import Item, Field
class sumItem(Item):
# define the fields for your item here like:
# name = scrapy.Field()
page = Field()
urls = Field()
Still I'm not getting exact output not able to fetch all pages when I am crawling it.