1

Hello i want to scrape product specification table available on the product page of the link : https://www.amazon.com/dp/B07HJ41HCF for which i have written the following spider in scrapy.

 def parse(self, response):
        item = GraingerItem()
        item['url'] = response.url
        item['proddescription'] = response.xpath('//*[@id="productDetails_detailBullets_sections1"]/td[1]/th/text()').extract()
        item['title'] = response.xpath('//*[@id="productTitle"]/text()').extract()[0].strip()
        try:
            item['sellername'] = response.xpath('//*[@id="bylineInfo"]/text()').extract()[0].strip()
        except IndexError:
            item['sellername'] = "No Seller Name"
        gg=[]
        cc= response.xpath('//*[@class="a-link-normal a-color-tertiary"]')
        for bb in cc:
            dd=bb.xpath('text()').extract()[0].strip()
            gg.append(dd)
            gg.append(">")
        qq=str(gg)
        qr=qq.replace("'","")
        qs = qr.replace(">]","")
        qt=qs.replace("[","")
        qu = qt.replace(",","")
        item['travlink'] = qu
        try:
            item['rating'] = response.xpath('//*[@id="acrPopover"]/span[1]/a/i[1]/span/text()').extract()[0].strip()
        except IndexError:
            item['rating'] = "Be the First one to review"
        try:
            item['Crreview'] = response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract()[0].strip()
        except IndexError:
            item['Crreview'] = "Be the First one to review"
        dd = response.xpath('//*[@id="feature-bullets"]/ul')
        ft = []
        for i in range(2,40):
            q = str(i)
            trows ="li["+q+"]"
            xpathgiven = trows + "/span/text()"
            for bullets in dd:
                b1= bullets.xpath(xpathgiven).extract()
                for ac in b1:
                    ab = ac.replace("\xa0", "")
                ft.append(b1)
                ft.append(";")
            stft = str(ft)
            stft1 = stft.replace("';', [], ';'","")
            stft2 = stft1.replace("\\t","")
            stft3 = stft2.replace('\\n',"")
            stft4 = stft3.replace("'","")
            stft5 = stft4.replace("[","")
            stft6 = stft5.replace("]","")
            stft7 = stft6.replace(",","")
            item['feature'] = stft7
        description = []
        try:

            for i in range(1, 100):
                q1 = str(i)
                trows1 = "[" + q1 + "]"
                xpathgiven1 = "//*[@id='productDescription']/p/text()["+q1+"]"
                gg = response.xpath(xpathgiven1).extract()
                description.append(gg)
                description.append(";")
            stft = str(description)
            dsft1 = stft.replace("';', [], ';'", "")
            dsft2 = dsft1.replace("'], ';', ['", ";")
            dsft3 = dsft2.replace('\\n', "")
            dsft33 = dsft3.replace('\\t', "")
            dsft4 = dsft33.replace("'", "")
            dsft5 = dsft4.replace("[", "")
            dsft6 = dsft5.replace("]", "")
            dsft7 = dsft6.replace(",", "")
            item['Description'] = dsft7
        except IndexError:
            item['Description'] = "No Description"

In the above code everything works fine but the item['proddescription'] does yield an empty list any help with the above will be highly appreciated

codehacker
  • 320
  • 2
  • 3
  • 16
  • Have you tried to parse it on shell? Are your scraper returning the data? – Pankaj Feb 08 '19 at 04:40
  • Please check out another thread on similar issue where I explained in detail. https://stackoverflow.com/questions/54471844/scrapy-spider-finishing-scraping-process-without-scraping-anything/54474019#54474019 – Pankaj Feb 08 '19 at 04:46
  • @PS1212 is there any way out for that – codehacker Feb 08 '19 at 05:41

1 Answers1

0

Worked for your variant:

response.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr/*/text()').re('(\w+[^\n]+)')

enter image description here