0

I have been trying to extract data from consumercomplaints.in the title and the data inside those title links.I wrote the following code and unable to parse through the links and extract the data and also I am unable to extract all the links related.plz guide

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem

class criticspider(CrawlSpider):
    name ="comp"
    allowed_domains =["consumercomplaints.in"]
    #start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls=["http://www.consumercomplaints.in/?search=delhivery"]
    rules=(
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
    )
    def parse(self,response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []

        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                    meta={'item': item},
                    callback=self.anchor_page)

            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)


    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item'] # Receiving parse Method item that was in Request meta
        # parse some more values
        #place them in old_item
        #e.g
        old_item['data']=hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item

1 Answers1

0

Are you using an old version of Scrapy?

In the latest stable version you don't need to do hxs = Selector(response) nor using the hxs.select() method. You can do the same thing just with response.xpath().

I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:

link = site.select('.//td[@class="complaint"]/a/@href').extract()
if link:
    item['link'] = link[0]

You probably want to do a similar thing for title too.

EDIT: I got it working with a few changes:

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []

        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']

        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
Elias Dorneles
  • 22,556
  • 11
  • 85
  • 107