using scrapy extracting data inside links

Question

I have been trying to extract data from consumercomplaints.in the title and the data inside those title links.I wrote the following code and unable to parse through the links and extract the data and also I am unable to extract all the links related.plz guide

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem

class criticspider(CrawlSpider):
    name ="comp"
    allowed_domains =["consumercomplaints.in"]
    #start_urls =["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls=["http://www.consumercomplaints.in/?search=delhivery"]
    rules=(
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)),callback="parse_health",follow=True),
    )
    def parse(self,response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []

        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                    meta={'item': item},
                    callback=self.anchor_page)

            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)


    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item'] # Receiving parse Method item that was in Request meta
        # parse some more values
        #place them in old_item
        #e.g
        old_item['data']=hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item

Elias Dorneles · Accepted Answer · 2015-01-23T10:44:53.843

Are you using an old version of Scrapy?

In the latest stable version you don't need to do hxs = Selector(response) nor using the hxs.select() method. You can do the same thing just with response.xpath().

I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:

link = site.select('.//td[@class="complaint"]/a/@href').extract()
if link:
    item['link'] = link[0]

You probably want to do a similar thing for title too.

EDIT: I got it working with a few changes:

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []

        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)

            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']

        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item

@NikhilParmar I edited your example and posted a working version — Elias Dorneles, Jan 23 '15 at 10:45
thankyou so much it worked ,can you tell me what does extract().[0]? — Nikhil Parmar, Jan 24 '15 at 13:46
@NikhilParmar ``extract()`` returns a list, the ``[0]`` part gets the first element of it -- I tried to explain that in the answer too. If my answer was useful, please mark it as accepted (the V button under the voting arrows). — Elias Dorneles, Jan 24 '15 at 18:16
hey can you help me with this question-http://stackoverflow.com/questions/28102472/scrapy-needs-to-crawl-all-the-next-links-on-website-and-move-on-to-the-next-page — Nikhil Parmar, Jan 25 '15 at 15:14

using scrapy extracting data inside links

1 Answers1

Linked