I have a working Scrapy script that inserts scraped items into a database through a pipeline class. However, this seems to slow down the scrape considerably. I'm using the `process_item` method to insert each scraped item into the database as it is scraped, committing after every item. Would it be faster to write the scraped items to a CSV file and then use a stored procedure to insert the data into the database?
def process_item(self, item, spider):
    if 'address_line_1' in item:
        # Insert only if no row exists for this date/address/suburb/state/postcode.
        sql = """INSERT dbo.PropertyListings (date, url, ad_type, address_line_1, suburb, state, postcode)
                 SELECT ?, ?, ?, ?, ?, ?, ?
                 WHERE NOT EXISTS
                 ( SELECT 1
                   FROM dbo.PropertyListings
                   WHERE date = ?
                     AND address_line_1 = ?
                     AND suburb = ?
                     AND state = ?
                     AND postcode = ?
                 )
              """
        # Parameters appear twice: once for the SELECT list, once for the EXISTS check.
        self.crsr.execute(sql, item['date'], item['url'], item['ad_type'],
                          item['address_line_1'], item['suburb'],
                          item['state'], item['postcode'],
                          item['date'], item['address_line_1'], item['suburb'],
                          item['state'], item['postcode'])
        self.conn.commit()  # one commit per scraped item
    else:
        # No address available: dedupe on date + url and store an empty address.
        sql = """INSERT dbo.PropertyListings (date, url, ad_type, address_line_1, suburb, state, postcode)
                 SELECT ?, ?, ?, ?, ?, ?, ?
                 WHERE NOT EXISTS
                 ( SELECT 1
                   FROM dbo.PropertyListings
                   WHERE date = ?
                     AND url = ?
                 )
              """
        self.crsr.execute(sql, item['date'], item['url'], item['ad_type'], '',
                          item['suburb'], item['state'], item['postcode'],
                          item['date'], item['url'])
        self.conn.commit()  # one commit per scraped item
    return item
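
In case it helps frame answers, here's roughly what I have in mind for the CSV route: a minimal sketch, assuming pyodbc, with the DSN and the stored procedure name (dbo.usp_LoadListings) both hypothetical. The existing-row check from the pipeline above is omitted for brevity; I'd expect the stored procedure to handle de-duplication.

import csv
import pyodbc


class CsvExportPipeline:
    # Buffer rows to a local CSV during the crawl, then load the file in one go.

    def open_spider(self, spider):
        self.file = open('listings.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.writer(self.file)

    def process_item(self, item, spider):
        # Write to the local file instead of a per-item database round trip.
        self.writer.writerow([
            item['date'], item['url'], item['ad_type'],
            item.get('address_line_1', ''), item['suburb'],
            item['state'], item['postcode'],
        ])
        return item

    def close_spider(self, spider):
        self.file.close()
        conn = pyodbc.connect('DSN=MyDsn')  # hypothetical DSN
        # Hypothetical stored procedure that loads the file server-side
        # (e.g. via BULK INSERT; the path must be visible to the SQL Server instance).
        conn.cursor().execute("{CALL dbo.usp_LoadListings (?)}", 'listings.csv')
        conn.commit()
        conn.close()

Alternatively, would batching the inserts (collecting rows during the crawl, then one executemany call with pyodbc's fast_executemany and a single commit in close_spider) get most of the same speed-up without the file handoff?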