I'm trying to scrape the website of a prominent UK retailer, using both Selenium and Scrapy (see code below). I'm getting a [scrapy.core.scraper] ERROR: Spider error processing
and have no idea what else to do (been at it for three hours or so). Thank you for all your support!
import scrapy
from selenium import webdriver
from nl_scrape.items import NlScrapeItem
import time
class ProductSpider(scrapy.Spider):
name = "product_spider"
allowed_domains = ['newlook.com']
start_urls = ['http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%7Cmn%7Cwomens%7Cclothing#/?q=:relevance&page=1&sort=relevance&content=false']
def __init__(self):
self.driver = webdriver.Safari()
self.driver.set_window_size(800,600)
time.sleep(4)
def parse(self, response):
self.driver.get(response.url)
time.sleep(4)
# Collect products
products = driver.find_elements_by_class_name('plp-item ng-scope')
# Iterate over products; extract data and append individual features to NlScrapeItem
for item in products:
# Pull features
desc = item.find_element_by_class_name('product-item__name link--nounderline ng-binding').text
href = item.find_element_by_class_name('plp-carousel__img-link ng-scope').get_attribute('href')
# Price Symbol removal and integer conversion
priceString = item.find_element_by_class_name('price ng-binding').text
priceInt = priceString.split('£')[1]
price = float(priceInt)
# Generate a product identifier
identifier = href.split('/p/')[1].split('?comp')[0]
identifier = int(identifier)
# datetime
dt = date.today()
dt = dt.isoformat()
# NlScrapeItem
item = NlScrapeItem()
# Append product to NlScrapeItem
item['id'] = identifier
item['href'] = href
item['description'] = desc
item['price'] = price
item['firstSighted'] = dt
item['lastSighted'] = dt
yield item
self.driver.close()
2017-08-26 15:48:38 [scrapy.core.scraper] ERROR: Spider error processing http://www.newlook.com/uk/womens/clothing/c/uk-womens-clothing?comp=NavigationBar%7Cmn%7Cwomens%7Cclothing#/?q=:relevance&page=1&sort=relevance&content=false> (referer: None)
Traceback (most recent call last): File "/Users/username/Documents/nl_scraping/nl_env/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks current.result = callback(current.result, *args, **kw) File "/Users/username/Documents/nl_scraping/nl_scrape/nl_scrape/spiders/product_spider.py", line 18, in parse products = driver.find_elements_by_class_name('plp-item ng-scope') NameError: name 'driver' is not defined