I'm trying to get my output to look like the following in json format.
{"loser": "De Schepper K." ,"winner": "Herbert P.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
But I'm currently getting individual lines for each loser item and winner item. I would like both winner and loser to be on the same line with the url.
{"loser": "De Schepper K.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
{"winner": "Herbert P.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
{"loser": "Sugita Y.", "url":
"https://www.sofascore.com/tennis/2018-02-07"}
I'm not sure if it's my selectors that's causing this behaviour but I'd like to know how I can customise the pipelines so the loser, winner and date are all on the same json line
I've never extracted json format before so it's new to me. How do you specify what json keys and values will be on each line using custom pipeline?
I also tried to use csv item exporter to do this and got strange behaviour too. ref Scrapy output is showing empty rows per column
Here's my spider.py
import scrapy
from scrapy_splash import SplashRequest
from scrapejs.items import SofascoreItemLoader
from scrapy import Spider
import json
from scrapy.http import Request, FormRequest
class MySpider(scrapy.Spider):
name = "jsscraper"
start_urls = ["https://www.sofascore.com/tennis/2018-02-07"]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url,
callback=self.parse,
endpoint='render.html',
args={'wait': 1.5})
def parse(self, response):
for row in response.css('.event-team'):
il = SofascoreItemLoader(selector=row)
il.add_css('winner' , '.event-team:nth-
child(2)::text')
il.add_css('loser' , '.event-team:nth-
child(1)::text')
il.add_value('url', response.url)
yield il.load_item()
items.py
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from operator import methodcaller
from scrapy import Spider, Request, Selector
class SofascoreItem(scrapy.Item):
loser = scrapy.Field()
winner = scrapy.Field()
url = scrapy.Field()
class SofascoreItemLoader(ItemLoader):
default_item_class = SofascoreItem
default_input_processor = MapCompose(methodcaller('strip'))
default_output_processor = TakeFirst()
pipeline.py
import json
import codecs
from collections import OrderedDict
class JsonPipeline(object):
def __init__(self):
self.file = codecs.open('data_utf8.json' , 'w' ,
encoding='utf-8')
def process_item(self , item , spider):
line = json.dumps(OrderedDict(item) , ensure_ascii=False ,
sort_keys=False) + "\n"
self.file.write(line)
return item
def close_spider(self , spider):
self.file.close()