how to access Spider command line arguments in the parse function in scrapy tool
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.http import Request
import string
import xlrd, xlwt
import time
import json
class Myspider(BaseSpider):
name="doctor"
allowed_domain = ["tolexo.org"]
#start_urls=["http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%1"]
def __init__(self, pageno='', excelsheetname='',*args, **kwargs):
super(Myspider, self).__init__(*args, **kwargs)
self.start_urls =["http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d",pageno]
page=int(pageno)
self.excelname=excelsheetname
self.page=int(pageno)
workbook=xlwt.Workbook()
sheet = workbook.add_sheet('Sheet1')
style=xlwt.easyxf('font : bold 1')
style2=xlwt.easyxf('font :bold 0')
sheet.write(0,0,"category",style)
sheet.col(0).width=256*(30+1)
sheet.write(0,1,"sub-category1",style)
sheet.col(1).width=256*(30+1)
sheet.write(0,2,"sub-category2",style)
sheet.col(2).width=256*(30+1)
sheet.write(0,3,"Title",style)
sheet.col(3).width=256*(30+1)
sheet.write(0,4,"MRP",style)
sheet.col(4).width=256*(20+1)
sheet.write(0,5,"Sale-price",style)
sheet.col(5).width=256*(20+1)
sheet.write(0,6,"Image-link",style)
sheet.col(6).width=256*(60+1)
rows=0
cols=7
specifications={}
rowsbreak=0
colsbreak=0
url=""
def parse(self,response):
hxs=HtmlXPathSelector(response)
url=None
link= hxs.select("//li[@class='fav-item item']")
for href in response.xpath("//li[@class='fav-item item']/a/@href"):
dat=href in response.xpath("//li[@class='fav-item item']/a/@href")
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_dir_contents)
self.workbook.save(self.excelname)
self.page.
if(page<260):
yield Request(url="http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=%d" %self.page,
headers={"Referer": "http://www.tolexo.com/medical-supplies.html?dir=asc&limit=96&mode=grid&order=bestsellers&p=1", "X-Requested-With": "XMLHttpRequest"},
callback=self.parse,
dont_filter=True)
def parse_dir_contents(self,response):
self.rows=self.rows+1
hxs=HtmlXPathSelector(response)
categories=hxs.select("//div [@class='col-sm-12 a-left']/ul [@typeof='BreadcrumbList']/li/a")
cat=categories.select('text()').extract()
cat=[c.strip() for c in cat]
cat.remove("Home")
category=cat[0]
try:
subcat1=cat[1]
except:
subcat1='-'
try:
subcat2=cat[2]
except:
subcat2='-'
tit=hxs.select("//div[@class='product-name']/h1")
title=tit.select('text()').extract()
titt=title[0]
mpri=hxs.select("//div[@class='mprice strike']/span")
if not mpri:
mpri=hxs.select("//div[@class='mprice strike clearfix']/span")
spri=hxs.select("//span [@itemprop='price']")
saleprice=spri.select('text()').extract()
mrp=mpri.select('text()').extract()
try:
mrpp=mrp[0]
except:
mrpp="-"
try:
sp=saleprice[0]
except:
sp="-"
im=hxs.select("//div[@class='gallery-img']")
img=im.select('img/@data-img-src').extract()
try:
imgg=img[0]
except:
img="-"
pro=hxs.select("//table[@class='product-spec']//td").extract()
pro1=hxs.select("//table[@class='product-spec']//th").extract()
pro_des=[]
pro_sep=[]
sep="View"
print category+"--->"+subcat1+"----->"+subcat2+"----->"+titt+"----->"+mrpp+"---->"+sp
import re
for p in pro:
ppp=re.sub('<[^>]*>', '', p)
ppp=ppp.split(sep,1)[0]
ppp=ppp.strip()
pro_des.append(ppp)
for pp in pro1:
proo=re.sub('<[^>]*>', '', pp)
proo=proo.strip()
pro_sep.append(proo)
print pro_sep
cat_len=len(cat)
title_len=len(title)
mrp_len=len(mrp)
saleprice_len=len(saleprice)
img_len=len(img)
try:
self.sheet.write(self.rows,0,category,self.style2)
self.sheet.write(self.rows,1,subcat1,self.style2)
self.sheet.write(self.rows,2,subcat2,self.style2)
self.sheet.write(self.rows,3,titt,self.style2)
self.sheet.write(self.rows,4,mrpp,self.style2)
self.sheet.write(self.rows,5,sp,self.style2)
self.sheet.write(self.rows,6,imgg,self.style2)
except:
print
for p,pp in zip(pro_sep,pro_des):
try:
if p in self.specifications:
self.sheet.write(self.rows,self.specifications.get(p),pp,self.style2)
else:
self.specifications.update({p:self.cols})
self.sheet.write(0,self.cols,p,self.style)
self.sheet.write(self.rows,self.cols,pp,self.style2)
self.cols=self.cols+1
except:
print
self.rowsbreak=self.rows
self.colsbreak=self.cols
self.urlbreak=str(response)