I am trying to save scraped items in separate JSON files, but I don't see any output files. The pipeline and the item are defined in the pipelines.py and items.py files in the Scrapy project folder. Do I have to call process_item() explicitly, or will it be called automatically when I return the item in scrape()? I enabled the pipeline in CrawlerProcess(settings={'ITEM_PIPELINES'}). Thanks.
The pipeline
import datetime
import json
import logging
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item to its own JSON file.

    Files are created in the current working directory and are named after
    the local time at which the item was processed (``YYYYmmddHHMMSS.json``).
    When that name is already taken, for example because several items are
    processed within the same second, a numeric suffix (``_1``, ``_2``, ...)
    is appended so that no item overwrites another one.
    """

    def process_item(self, item, spider):
        """Dump *item* as JSON into a new file and pass the item on.

        Writing is best-effort: serialization and I/O failures are logged
        and the item is still returned, so later pipeline stages keep
        receiving it.

        Args:
            item: The scraped item; it must be convertible with ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged *item*.
        """
        log = logging.getLogger(__name__)
        stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

        # Serialize before touching the disk so that an item which cannot be
        # serialized does not leave an empty or truncated file behind.
        try:
            payload = json.dumps(dict(item))
        except (TypeError, ValueError):
            log.exception("Could not serialize item to JSON: %r", item)
            return item

        suffix = 0
        while True:
            file_name = stamp + ("_%d" % suffix if suffix else "") + ".json"
            try:
                # Mode 'x' fails instead of truncating when the file already
                # exists; the timestamp only has one-second resolution.
                with open(file_name, "x", encoding="utf-8") as fp:
                    fp.write(payload)
            except FileExistsError:
                suffix += 1
            except OSError:
                log.exception("Could not write item to %s", file_name)
                break
            else:
                break
        return item
class ProjectItem(scrapy.Item):
    """Scrapy item holding the data collected for a single article."""

    # Text extracted from the article page's ``.entry-title`` element.
    title = scrapy.Field()

    # Link to the article as found on the archive listing page.
    url = scrapy.Field()
class mySpider(CrawlSpider):
    """Collect the URL and title of every article in the November 2019 archive.

    Crawl flow: ``parse`` finds the monthly archive link on the start page,
    ``scrape`` walks the (paginated) archive listing and requests every
    article, and ``getContent`` completes and emits one item per article.

    NOTE(review): the Scrapy documentation warns against overriding
    ``parse`` on a ``CrawlSpider``. No ``rules`` are defined here, so a
    plain ``scrapy.Spider`` base class would presumably be sufficient;
    confirm before changing the base class.
    """

    name = 'mySPider'
    allowed_domains = ['allowedDOmain.org']
    start_urls = ['https://url.org']

    def parse(self, response):
        """Follow the "November 2019" archive link found on the start page."""
        month_selector = '//div[@class="archives-column"]/ul/li/a[contains(text(),"November 2019")]/@href'
        month_link = response.xpath(month_selector).extract_first()
        if month_link is None:
            # response.follow() cannot build a request from None.
            self.logger.warning('No archive link found on %s', response.url)
            return
        yield response.follow(month_link, callback=self.scrape)

    def scrape(self, response):
        """Request every article on an archive page, then the next page."""
        # Get the links to all individual articles.
        link_selector = '.entry-title a::attr(href)'
        for link in response.css(link_selector).extract():
            item = ProjectItem()
            item['url'] = link
            request = response.follow(link, callback=self.getContent)
            # Hand the partially filled item to getContent(), which adds the
            # title and emits the finished item.
            request.meta['item'] = item
            # Yield the request, not the item: otherwise getContent() is
            # never called and the item reaches the pipeline without title.
            yield request

        next_page_selector = 'span.page-link a::attr(href)'
        next_page_link = response.css(next_page_selector).extract_first()
        # The last archive page has no "next" link.
        if next_page_link is not None:
            yield response.follow(next_page_link, callback=self.scrape)

    def getContent(self, response):
        """Add the article title to the item carried in the request meta."""
        item = response.meta['item']
        title_selector = '.entry-title ::text'
        item['title'] = response.css(title_selector).extract_first()
        yield item