Link Extractor in Scrapy with process_value

Question

I am trying to extract the data from myntra.com using scrapy. My code till now -

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class VideoSpider(CrawlSpider):
    name = 'video'
    allowed_domains = ['myntra.com']

    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'

    # def __init__(self, url = ""):
    #         # self.input = input  # source file name
    #         self.url = url
    #         # self.last = last

    def start_requests(self):
        # yield scrapy.Request(url='https://www.amazon.in/gp/bestsellers/videogames/ref=zg_bs_nav_0', headers={
        #     'User-Agent': self.user_agent
        # })
        yield scrapy.Request(url=self.url, headers={
            'User-Agent': self.user_agent
        }, callback=self.parse)
    #    with open("./Input/amazon.csv") as f:
    #     for line in f:
    #         category, url = line.split(',')
    #         category = category.strip()
    #         url = url.strip()
    #         yield scrapy.Request(url=url, headers={
    #             'User-Agent': self.user_agent
    #         }, meta={'urlkey':category})


    rules = (
        Rule(LinkExtractor(restrict_xpaths="//li[@class='product-base']", process_value=lambda x :"https://www.myntra.com/" +x), callback='parse_item', follow=True, process_request='set_user_agent'), # have tried //li[@class='product-base']/a/@href and //li[@class='product-base']/a[1] as well for restricted_xpaths
        Rule(LinkExtractor(restrict_xpaths="//li[@class='pagination-next']/a"), process_request='set_user_agent')
    )

    # def parse_start(self, response):
    #     print(response)
    #     all_links = response.xpath('//li[@class="product-base"]/a/@href').extract()
    #     print(all_links)
    #     for link in all_links:
    #         yield scrapy.Request(url='myntra.com'+link, callback=self.parse_item)
        # return super().parse_start_url(response)
    # def parse_fail(self, response):
    #     print(response.url)
        # all_links = response.xpath('//li[@class="product-base"]/a/@href').extract()
        # print(all_links)
        # for link in all_links:
        #     yield scrapy.Request(url='myntra.com'+link, callback=self.parse_item)

    def set_user_agent(self, request):
        request.headers['User-Agent'] = self.user_agent
        return request

    # def process_values(self,value):
    #     print(value)
    #     value = "https://www.myntra.com/" + value
    #     print(value)
    #     return value

    # def link_add(self, links):
    #     print(links)


    def parse_item(self, response):
        # yield {
        #     'title':response.xpath("normalize-space(//span[@class='a-size-large']/text())").get(),
        #     'brand':response.xpath("normalize-space(//div[@class='a-section a-spacing-none']/a/text())").get(),
        #     'product-specification':response.xpath("normalize-space(//ul[@class='a-unordered-list a-vertical a-spacing-mini']/li/span/text())").get(),
        #     'product-description':response.xpath("normalize-space(//div[@class='a-row feature']/div[2]/p/text())").get(),
        #     'user-agent':response.request.headers['User-Agent']
        # }
        item = dict()
        item['title'] = response.xpath("//div[@class='pdp-price-info']/h1/text()").extract()
        item['price'] = response.xpath("normalize-space(//span[@class='pdp-price']/strong/text())").extract()
        item['product-specification'] = response.xpath("//div[@class='index-tableContainer']/div/div/text()").extract()
        item['product-specification'] = [p.replace("\t", "") for p in item['product-specification']]
        yield item
        # yield {
        #     'title':response.xpath("normalize-space(//span[@class='a-size-large']/text())").extract(),
        #     'brand':response.xpath("normalize-space(//div[@class='a-section a-spacing-none']/a/text())").extract(),
        #     'product-specification':response.xpath("//ul[@class='a-unordered-list a-vertical a-spacing-mini']/li/span/text()").extract(),
        #     'product-description':response.xpath("normalize-space(//div[@class='a-row feature']/div[2]/p/text())").extract(),
        # }


# //div[@class="search-searchProductsContainer row-base"]//section//ul//li[@class="product-base"]//a//@href

The comments in the code show all my attempts.

Start url passed as url in argument

The xpath for href to be used in link extractor is //li[@class='product-base']/a/@href. But, the issue is the href needs to be appended with https://myntra.com/ ahead of extracted value of link extractor and hence the lambda function for process_value. But, the code doesn't run.

Output

2020-05-26 02:52:12 [scrapy.core.engine] INFO: Spider opened
2020-05-26 02:52:12 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2020-05-26 02:52:12 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2020-05-26 02:52:12 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.myntra.com/robots.txt> (referer: None)
2020-05-26 02:52:13 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.myntra.com/men-footwear> (referer: None)
2020-05-26 02:52:13 [scrapy.core.engine] INFO: Closing spider (finished)
2020-05-26 02:52:13 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 1023,
 'downloader/request_count': 2,
 'downloader/request_method_count/GET': 2,
 'downloader/response_bytes': 87336,
 'downloader/response_count': 2,
 'downloader/response_status_count/200': 2,
 'elapsed_time_seconds': 0.76699,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2020, 5, 25, 21, 22, 13, 437855),
 'log_count/DEBUG': 2,
 'log_count/INFO': 10,
 'log_count/WARNING': 1,
 'memusage/max': 51507200,
 'memusage/startup': 51507200,
 'response_received_count': 2,
 'robotstxt/request_count': 1,
 'robotstxt/response_count': 1,
 'robotstxt/response_status_count/200': 1,
 'scheduler/dequeued': 1,
 'scheduler/dequeued/memory': 1,
 'scheduler/enqueued': 1,
 'scheduler/enqueued/memory': 1,
 'start_time': datetime.datetime(2020, 5, 25, 21, 22, 12, 670865)}
2020-05-26 02:52:13 [scrapy.core.engine] INFO: Spider closed (finished)

Any help will be appreciated.

in `start_url` you use `self.url` and `self.parse` but I don't see where you define them. — furas, May 25 '20 at 22:10
this page uses JavaScript to add items but Scrapy can't run JavaScript - you will need `Scrapy-Selenium` to control real web browser which can run JavaScript. BTW: turn off JavaScript in web browser and load your url in web browser to see what Scrapy can get without `Selenium`. — furas, May 25 '20 at 22:19
@furas, thanks for the reply. For first reply, self.url as mentioned is passed as argument. self.parse is default of scrapy. As you can see, I tried to override it but no help. Problem is response is different. Reason - I guess what you states as JS page. I am using Scrapy for first time. I usually use Selenium. Can you provide me links for Scrapy - Selenium workaround. I have already peeped on the response via response.text and exporting it to file and it is not the desired response. — Plasmatiger, May 25 '20 at 22:37
https://stackoverflow.com/questions/17975471/selenium-with-scrapy-for-dynamic-page - I found this. Okay, so it's like using selenium. I can give it a try — Plasmatiger, May 25 '20 at 22:41
[Scrapy-Selenium](https://github.com/clemfromspace/scrapy-selenium) — furas, May 25 '20 at 22:46
https://stackoverflow.com/questions/30345623/scraping-dynamic-content-using-python-scrapy Using splashJS — Plasmatiger, May 25 '20 at 22:56

furas · Accepted Answer · 2020-05-25T23:38:36.100

This page uses JavaScript to add item but it doesn't read it from external file but it has all data in tag <script>

import requests
from bs4 import BeautifulSoup
import json

base_url = "https://www.myntra.com/men-footwear"

r = requests.get(base_url)

soup = BeautifulSoup(r.text, 'html.parser')

# get .text
scripts = soup.find_all('script')[8].text

# remove window.__myx = 
script = scripts.split('=', 1)[1]

# convert to dictionary
data = json.loads(script)

for item in data['searchData']['results']['products']:
    #print(item.keys())
    #for key, value in item.items():
    #    print(key, '=', value)

    print('product:', item['product'])
    #print('productId:', item['productId'])
    #print('brand:', item['brand'])
    print('url:', 'https://www.myntra.com/' + item['landingPageUrl'])
    print('---')

Result:

product: Puma Men Black Rapid Runner IDP Running Shoes
url: https://www.myntra.com/sports-shoes/puma/puma-men-black-rapid-runner-idp-running-shoes/9005767/buy
---
product: Puma Men White Smash Leather Sneakers
url: https://www.myntra.com/casual-shoes/puma/puma-men-white-smash-leather-sneakers/1966314/buy
---
product: Puma Unisex Grey Escaper Core Running Shoes
url: https://www.myntra.com/sports-shoes/puma/puma-unisex-grey-escaper-core-running-shoes/10137271/buy
---
product: Red Tape Men Brown Leather Derbys
url: https://www.myntra.com/casual-shoes/red-tape/red-tape-men-brown-leather-derbys/10300791/buy
---

EDIT: The same with Scrapy

You can put all code in one file and run python script.py without creating project.

It uses meta to send product data form one parser (which parses main page) to other parser (which parses product page)

import scrapy
import json

class MySpider(scrapy.Spider):

    name = 'myspider'

    start_urls = ['https://www.myntra.com/men-footwear']

    def parse(self, response):
        print('url:', response.url)

        scripts = response.xpath('//script/text()')[9].get()

        # remove window.__myx = 
        script = scripts.split('=', 1)[1]

        # convert to dictionary
        data = json.loads(script)

        for item in data['searchData']['results']['products']:

            info = {
                'product': item['product'],
                'productId': item['productId'],
                'brand': item['brand'],
                'url': 'https://www.myntra.com/' + item['landingPageUrl'],
            }

            #yield info

            yield response.follow(item['landingPageUrl'], callback=self.parse_item, meta={'item': info})

    def parse_item(self, response):
        print('url:', response.url)

        info = response.meta['item']

        # TODO: parse product page with more information

        yield info

# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv', #
})
c.crawl(MySpider)
c.start()

thanks a lot for the response. I am looking for solution in scrapy. Thanks anyways. Sorry, I can't mark this as correct answer since, it might be misleading based on title. — Plasmatiger, May 25 '20 at 23:24
you can use Scrapy to get HTML and my example to get data from ` — furas, May 25 '20 at 23:28
thanks a lot. The second page is also a JS which we followed, but after your inputs, I can give it a try — Plasmatiger, May 26 '20 at 07:50

Link Extractor in Scrapy with process_value

1 Answers1