I am building a scraper with Scrapy and Python 3. My script is supposed to scrape a directory of companies.
Sometimes Scrapy doesn't find a field of an item (the email or the website) because the directory didn't publish those fields.
I am trying to handle this kind of exception, but as a newbie it is complicated for me.
I tried to fix it with some "if" statements, but my real problem is handling it in my pipelines.py file, where I insert the values into my MySQL database.
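To show what I mean by "some if": this is a minimal sketch of the default-value handling I am aiming for, using extract_first() with a default so every field is always a string (only two fields shown; names are the ones from my items.py below):

    # Sketch only: extract_first() returns the first match, or the default
    # when the page doesn't publish the field, so the pipeline should never
    # hit a KeyError or receive a list.
    def parse_startup(self, response):
        items = ScrapingEntreprisesItem()
        items["startup_name"] = response.xpath("//h1/text()").extract_first(default='')
        items["startup_email"] = response.xpath("//*[@itemprop='email']/text()").extract_first(default='')
        yield items

I am not sure this is the right approach, which is why I am asking.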
This is my spider file:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from ..items import ScrapingEntreprisesItem
from urlextract import URLExtract


class UsinedigitaleSpider(scrapy.Spider):
    name = 'usinedigitale'
    allowed_domains = ['usine-digitale.fr']
    start_urls = ['https://www.usine-digitale.fr/annuaire-start-up/']

    def parse(self, response):
        urls = response.xpath("//a[@class='contenu']/@href").extract()
        for url in urls:
            absolute_url = response.urljoin(url)
            print('Voici absolute url :' + absolute_url)
            yield Request(absolute_url, callback=self.parse_startup)

        next_page = response.xpath("//a[@rel='next']/@href").extract_first().strip()
        absolute_next_page_url = response.urljoin(next_page)
        yield Request(absolute_next_page_url)

    def parse_startup(self, response):
        items = ScrapingEntreprisesItem()

        startup_name = response.xpath("//h1/text()")
        startup_date_creation = response.xpath("//*[@itemprop='foundingDate']/@content")
        startup_website = response.xpath("//*[@id='infoPratiq']//a/@href")
        startup_email = response.xpath("//*[@itemprop='email']/text()")
        startup_address = response.xpath("//p[@itemprop='address']/text()")
        startup_founders = response.xpath("//*[@itemprop='founders']/p/text()")
        startup_market = response.xpath("//*[@id='ficheStartUp']/div[1]/article/div[6]/p")
        startup_description = response.xpath("//*[@itemprop='description']/p/text()")
        startup_short_description = response.xpath("//*[@itemprop='review']/p")

        if startup_name:
            items["startup_name"] = startup_name.extract()
        else:
            items["startup_name"] = ''

        if startup_date_creation:
            items["startup_date_creation"] = startup_date_creation.extract()
        else:
            items["startup_date_creation"] = ''

        if startup_website:
            startup_website = startup_website.extract()
            extractor = URLExtract()
            startup_website = extractor.find_urls(str(startup_website[0]))
            items["startup_website"] = startup_website
        else:
            items["startup_website"] = ''

        if startup_email:
            items["startup_email"] = startup_email.extract()
        else:
            items["startup_email"] = ''

        if startup_address:
            items["startup_address"] = startup_address.extract()
        else:
            items["startup_address"] = ''

        if startup_founders:
            items["startup_founders"] = startup_founders.extract()
        else:
            items["startup_founders"] = ''

        if startup_market:
            items["startup_market"] = startup_market.extract()
        else:
            items["startup_market"] = ''

        if startup_description:
            items["startup_description"] = startup_description.extract()
        else:
            items["startup_description"] = ''

        if startup_short_description:
            items["startup_short_description"] = startup_short_description.extract()
        else:
            items["startup_short_description"] = ''

        yield items
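As a side note, I suspect all the repeated if/else blocks above could be condensed by mapping field names to XPath expressions and relying on extract_first() with a default value. A rough, untested sketch (the website field would still need its separate URLExtract step):

    # Untested sketch: one dict of field name -> XPath expression;
    # extract_first() falls back to '' when nothing matches.
    FIELD_XPATHS = {
        "startup_name": "//h1/text()",
        "startup_email": "//*[@itemprop='email']/text()",
        # ... the other fields, with the same XPaths as above ...
    }

    def parse_startup(self, response):
        items = ScrapingEntreprisesItem()
        for field, xpath in FIELD_XPATHS.items():
            items[field] = response.xpath(xpath).extract_first(default='')
        yield items

But I don't know if that would solve my pipeline errors.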
This is my pipelines.py:
# -*- coding: utf-8 -*-
import mysql.connector

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapingEntreprisesPipeline(object):

    def __init__(self):
        self.create_connection()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host="mysql.com",
            port="3306",
            user="username",
            password="passpass",
            database='db123'
        )
        self.curr = self.conn.cursor()
        self.curr = self.conn.cursor(buffered=True)

    def process_item(self, item, spider):
        sql = "select * from entreprises where website = '" + item["startup_website"][0] + "' limit 1"
        print('sql : ' + sql)
        self.curr.execute(sql, multi=True)
        if self.curr.rowcount == 1:
            print("Entreprise found")
        else:
            print("Entreprise NOT found")
            self.curr.execute("""insert into entreprises values (default,%s,%s,%s,%s,%s,%s,%s,%s,%s )""", (
                str(item["startup_name"][0]),
                str(item["startup_date_creation"][0]),
                str(item["startup_website"][0]),
                str(item["startup_email"][0]),
                str(item["startup_address"][0]),
                str(item["startup_founders"][0]),
                str(item["startup_market"][0]),
                str(item["startup_description"][0]),
                str(item["startup_short_description"][0])
            ))
            self.conn.commit()
            print("DB mise à jour!!!")
        return item
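I also know the string concatenation in the SELECT above is fragile (a quote in the URL would break the query). This is the parameterized version I was considering, assuming each field holds a single string by the time it reaches the pipeline (sketch, untested):

    def process_item(self, item, spider):
        # Parameterized queries let the connector escape the values itself;
        # fetchone() tells us whether the company already exists.
        self.curr.execute(
            "SELECT 1 FROM entreprises WHERE website = %s LIMIT 1",
            (item.get("startup_website", ""),),
        )
        if self.curr.fetchone() is None:
            self.curr.execute(
                "INSERT INTO entreprises VALUES (default, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                tuple(item.get(f, "") for f in (
                    "startup_name", "startup_date_creation", "startup_website",
                    "startup_email", "startup_address", "startup_founders",
                    "startup_market", "startup_description",
                    "startup_short_description",
                )),
            )
            self.conn.commit()
        return item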
And this is my items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class ScrapingEntreprisesItem(scrapy.Item):
    # define the fields for your item here like:
    startup_name = scrapy.Field()
    startup_date_creation = scrapy.Field()
    startup_website = scrapy.Field()
    startup_email = scrapy.Field()
    startup_address = scrapy.Field()
    startup_founders = scrapy.Field()
    startup_market = scrapy.Field()
    startup_description = scrapy.Field()
    startup_short_description = scrapy.Field()
So I get several kinds of errors: either a KeyError, or a list index error, or MySQL complaining that it can't insert a list/dictionary into a text field.
Also, my scraper stops after a few hundred requests even though there are thousands of URLs to scrape. But that is another topic (mentioned just in case you spot the reason while reading).
Thanks in advance for trying to help me.