My Scrapy spider uploads results to a MongoDB database every 1000 scraped URLs. I'm appending the results to a list before uploading. Given that repeatedly appending to a list can be slow, is there a way I can save the results using a list comprehension instead? Or is appending to a list already the fastest option?
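To make the comparison concrete, here's a minimal standalone sketch of what I mean (the function names, batch size, and dummy documents are just illustrative), timing a loop of appends against a list comprehension with timeit:

import timeit

def with_append(n=1000):
    # build the batch one append at a time, like parse() does
    out = []
    for i in range(n):
        out.append({'url': i})
    return out

def with_comprehension(n=1000):
    # build the same batch in a single comprehension
    return [{'url': i} for i in range(n)]

print('append:       ', timeit.timeit(with_append, number=1000))
print('comprehension:', timeit.timeit(with_comprehension, number=1000))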
Here's my (simplified) spider:
import asyncio

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "spy"

    def __init__(self):
        # init MongoDB instance
        self.res_list = []
        self.urls = self.x.urls(10000)

    def start_requests(self):
        for url in self.urls:
            yield scrapy.Request(url=url, callback=self.parse)

    async def do_insert(self, documents):
        await self.db['coll'].insert_many(documents)

    def parse(self, r):
        res = self.x.process(r)
        self.res_list.append(res)
        if len(self.res_list) >= 1000:  # flush a full batch of 1000 results
            url_list = [u['url'] for u in self.res_list]  # (currently unused)
            print('UPLOADING TO DATABASE...')
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self.do_insert(self.res_list))
            self.res_list = []
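In case it's relevant, the Mongo setup that __init__ hides looks roughly like this (a minimal sketch assuming Motor for the async inserts; the URI, database, and collection names are placeholders):

import asyncio
from motor.motor_asyncio import AsyncIOMotorClient  # assuming Motor for async Mongo access

async def main():
    client = AsyncIOMotorClient('mongodb://localhost:27017')  # placeholder connection string
    db = client['scrapy_results']  # placeholder database name
    # one bulk insert per batch of results
    await db['coll'].insert_many([{'url': 'https://example.com'}])  # illustrative document

asyncio.run(main())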