I'm having trouble with my spider; the way I have set it up doesn't seem to work. The spider should scrape multiple pages (1, 2, 3) of the same website, and I'm not sure whether a for loop or an if/else statement is the right way to step through them and extract all the data. All I get in the log when I run it is: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min).
Any help would be greatly appreciated!
Shown below is the code for the spider, items.py, and pipelines.py:
import scrapy
from folder1.items import folder1Item  # assuming the item lives in folder1/items.py

class abcSpider(scrapy.Spider):
    name = 'abc'
    page_number = 2
    allowed_domains = ['']  # real domain left out here

    def parse(self, response):
        items = folder1Item()
        deal_number_var = response.css(".mclbEl a::text").extract()
        deal_type_var = response.css('#ContentContainer1_ctl00_Content_ListCtrl1_LB1_VDTBL .mclbEl:nth-child(9)').css('::text').extract()
        items['deal_number_var'] = deal_number_var
        items['deal_type_var'] = deal_type_var
        yield items

        next_page = '' + str(abcSpider.page_number) + '/'  # base URL left out here
        if abcSpider.page_number < 8:
            abcSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)
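For reference, here is the bare pagination pattern I am trying to follow, as a minimal sketch; example.com and the /N/ page path are placeholders for the real site, which I have left out above:

import scrapy

class PaginationSketchSpider(scrapy.Spider):
    # minimal sketch; example.com stands in for the real (redacted) site
    name = 'pagination_sketch'
    allowed_domains = ['example.com']       # just the domain, no scheme
    start_urls = ['http://example.com/1/']  # first page the spider fetches
    page_number = 2

    def parse(self, response):
        # extract whatever fields are needed from this page...
        yield {'url': response.url}

        # ...then follow the next page until page 8, mirroring my if-statement above
        if PaginationSketchSpider.page_number < 8:
            next_page = 'http://example.com/' + str(PaginationSketchSpider.page_number) + '/'
            PaginationSketchSpider.page_number += 1
            yield response.follow(next_page, callback=self.parse)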
This is my items.py file:
import scrapy

class folder1Item(scrapy.Item):
    deal_number_var = scrapy.Field()
    deal_type_var = scrapy.Field()
I would like to save the data to a .db file so I can open it with sqlite3. Here is my pipelines.py:
import sqlite3

class folder1Pipeline(object):
    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = sqlite3.connect("abc.db")
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS abc_tb""")
        self.curr.execute("""create table abc_tb(deal_number_var text, deal_type_var text)""")

    def process_item(self, items, spider):
        self.store_db(items)
        return items

    def store_db(self, items):
        # the table has two columns, so two placeholders
        self.curr.execute("""insert into abc_tb values (?,?)""",
                          (items['deal_number_var'][0], items['deal_type_var'][0]))
        self.conn.commit()
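Once the crawl works, this is how I plan to check abc.db, as a quick sqlite3 sanity check using the table and columns defined above:

import sqlite3

# print every row the pipeline wrote to abc.db
conn = sqlite3.connect("abc.db")
for deal_number, deal_type in conn.execute("SELECT deal_number_var, deal_type_var FROM abc_tb"):
    print(deal_number, deal_type)
conn.close()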
Here is my Middleware.py code:
from scrapy.http import HtmlResponse
from selenium import webdriver

class JSMiddleware(object):
    def process_request(self, request, spider):
        # render the page with PhantomJS so JavaScript-generated content is present
        driver = webdriver.PhantomJS()
        driver.get(request.url)
        body = driver.page_source
        url = driver.current_url
        driver.quit()  # close PhantomJS so each request does not leak a browser process
        return HtmlResponse(url, body=body, encoding='utf-8', request=request)
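In case the setup matters, this is roughly what I have in settings.py to enable the middleware and the pipeline; the folder1.* module paths are my guess at the project package name used above:

# settings.py (relevant parts) -- paths assume the project package is named folder1
DOWNLOADER_MIDDLEWARES = {
    'folder1.middlewares.JSMiddleware': 543,
}
ITEM_PIPELINES = {
    'folder1.pipelines.folder1Pipeline': 300,
}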