
How do I access spider command-line arguments in the parse function of a Scrapy spider?
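I pass pageno and excelsheetname as spider arguments, so the spider is launched with something like this (the output filename here is just an example):

    scrapy crawl doctor -a pageno=1 -a excelsheetname=output.xls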

import re

import scrapy
import xlwt
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request

class Myspider(BaseSpider):
    name="doctor"
    allowed_domain = ["tolexo.org"]

    #start_urls=["http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%1"]

    def __init__(self, pageno='', excelsheetname='',*args, **kwargs):
        super(Myspider, self).__init__(*args, **kwargs)
        self.start_urls =["http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d",pageno]
        page=int(pageno)
        self.excelname=excelsheetname
        self.page=int(pageno)






    # class-level workbook state, shared by all callbacks;
    # the header row is written once when the class is defined
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('Sheet1')

    style = xlwt.easyxf('font: bold 1')
    style2 = xlwt.easyxf('font: bold 0')
    for _col, (_header, _width) in enumerate([
            ("category", 30), ("sub-category1", 30), ("sub-category2", 30),
            ("Title", 30), ("MRP", 20), ("Sale-price", 20), ("Image-link", 60)]):
        sheet.write(0, _col, _header, style)
        sheet.col(_col).width = 256 * (_width + 1)

    rows = 0             # next data row to write
    cols = 7             # next free column for specification headers
    specifications = {}  # specification name -> column index

    rowsbreak = 0
    colsbreak = 0
    url = ""

    def parse(self, response):
        # follow every product link on the listing page
        for href in response.xpath("//li[@class='fav-item item']/a/@href"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

        self.workbook.save(self.excelname)

        # self.page came from the command line (-a pageno=...);
        # advance it and request the next listing page
        self.page += 1
        if self.page < 260:
            yield Request(
                url="http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d" % self.page,
                headers={"Referer": "http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=1",
                         "X-Requested-With": "XMLHttpRequest"},
                callback=self.parse,
                dont_filter=True)


    def parse_dir_contents(self, response):
        self.rows += 1
        hxs = HtmlXPathSelector(response)
        # the breadcrumb trail gives category and sub-categories
        categories = hxs.select("//div[@class='col-sm-12 a-left']/ul[@typeof='BreadcrumbList']/li/a")
        cat = [c.strip() for c in categories.select('text()').extract()]
        if "Home" in cat:
            cat.remove("Home")

        category = cat[0] if cat else '-'
        try:
            subcat1 = cat[1]
        except IndexError:
            subcat1 = '-'
        try:
            subcat2 = cat[2]
        except IndexError:
            subcat2 = '-'
        tit = hxs.select("//div[@class='product-name']/h1")
        title = tit.select('text()').extract()
        titt = title[0] if title else '-'
        mpri = hxs.select("//div[@class='mprice strike']/span")
        if not mpri:
            mpri = hxs.select("//div[@class='mprice strike clearfix']/span")
        spri = hxs.select("//span[@itemprop='price']")
        saleprice = spri.select('text()').extract()
        mrp = mpri.select('text()').extract()
        try:
            mrpp = mrp[0]
        except IndexError:
            mrpp = "-"
        try:
            sp = saleprice[0]
        except IndexError:
            sp = "-"
        im = hxs.select("//div[@class='gallery-img']")
        img = im.select('img/@data-img-src').extract()
        try:
            imgg = img[0]
        except IndexError:
            imgg = "-"  # bug fix: the original assigned img = "-", leaving imgg undefined
        pro = hxs.select("//table[@class='product-spec']//td").extract()
        pro1 = hxs.select("//table[@class='product-spec']//th").extract()
        pro_des = []
        pro_sep = []
        sep = "View"
        print category + "--->" + subcat1 + "----->" + subcat2 + "----->" + titt + "----->" + mrpp + "---->" + sp
        # strip HTML tags from the spec cells (re is imported at the top)
        for p in pro:
            ppp = re.sub('<[^>]*>', '', p)
            ppp = ppp.split(sep, 1)[0]
            pro_des.append(ppp.strip())

        for pp in pro1:
            proo = re.sub('<[^>]*>', '', pp)
            pro_sep.append(proo.strip())

        print pro_sep
        try:
            self.sheet.write(self.rows, 0, category, self.style2)
            self.sheet.write(self.rows, 1, subcat1, self.style2)
            self.sheet.write(self.rows, 2, subcat2, self.style2)
            self.sheet.write(self.rows, 3, titt, self.style2)
            self.sheet.write(self.rows, 4, mrpp, self.style2)
            self.sheet.write(self.rows, 5, sp, self.style2)
            self.sheet.write(self.rows, 6, imgg, self.style2)
        except Exception:
            pass

        # each previously unseen specification name gets its own column
        for p, pp in zip(pro_sep, pro_des):
            try:
                if p in self.specifications:
                    self.sheet.write(self.rows, self.specifications[p], pp, self.style2)
                else:
                    self.specifications[p] = self.cols
                    self.sheet.write(0, self.cols, p, self.style)
                    self.sheet.write(self.rows, self.cols, pp, self.style2)
                    self.cols += 1
            except Exception:
                pass
            self.rowsbreak = self.rows
            self.colsbreak = self.cols
            self.urlbreak = str(response)
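
For reference, here is the pattern in isolation, as a minimal sketch (the spider name, URL, and default value below are illustrative, not taken from the project above): anything passed with -a arrives as a keyword argument to __init__, and whatever you store on self there is readable inside parse.

    import scrapy

    class ArgsSpider(scrapy.Spider):
        # run with: scrapy crawl args_demo -a pageno=3
        name = "args_demo"

        def __init__(self, pageno='1', *args, **kwargs):
            super(ArgsSpider, self).__init__(*args, **kwargs)
            # -a values always arrive as strings, so convert explicitly
            self.page = int(pageno)
            self.start_urls = [
                "http://www.tolexo.com/medical-supplies.html?p=%d" % self.page]

        def parse(self, response):
            # the command-line argument is available here via self
            self.log("parsing page %d: %s" % (self.page, response.url))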
Maybe it will help you: http://stackoverflow.com/questions/20482526/scrapy-how-to-pass-list-of-arguments-through-command-prompt-to-spider – Danil Apr 05 '16 at 16:36
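(The gist of the approach in that link, sketched here rather than quoted: pass one delimited string with -a and split it in __init__.)

    # scrapy crawl doctor -a pages=1,5,9
    def __init__(self, pages='', *args, **kwargs):
        super(Myspider, self).__init__(*args, **kwargs)
        # split the single -a string into a list of ints
        self.pages = [int(p) for p in pages.split(',') if p]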
