I have a scrapy spider that looks for static html files on disk using the file:///
command as a start url, but I'm unable to load the gzip files and loop through my directory of 150,000 files which all have the .html.gz
suffix, I've tried several different approaches that I have commented out but nothing works so far, my code so far looks as
from scrapy.spiders import CrawlSpider
from Scrapy_new.items import Scrapy_newTestItem
import gzip
import glob
import os.path
class Scrapy_newSpider(CrawlSpider):
name = "info_extract"
source_dir = '/path/to/file/'
allowed_domains = []
start_urls = ['file://///path/to/files/.*html.gz']
def parse_item(self, response):
item = Scrapy_newTestItem()
item['user'] = response.xpath('//*[@id="page-user"]/div[1]/div/div/div[2]/div/div[2]/div[1]/h1/span[2]/text()').extract()
item['list_of_links'] = response.xpath('//*[@id="page-user"]/div[1]/div/div/div[2]/div/div[3]/div[3]/a/@href').extract()
item['list_of_text'] = response.xpath('//*[@id="page-user"]/div[1]/div/div/div/div/div/div/a/text()').extract()
Running this gives the error code
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/twisted/internet/defer.py", line 150, in maybeDeferred
result = f(*args, **kw)
File "/usr/local/lib/python2.7/site-packages/scrapy/core/downloader/handlers/file.py", line 13, in download_request
with open(filepath, 'rb') as fo:
IOError: [Errno 2] No such file or directory: 'path/to/files/*.html'
Changing my code so that the files are first unziped and then passed through as follow:
source_dir = 'path/to/files/'
for src_name in glob.glob(os.path.join(source_dir, '*.gz')):
base = os.path.basename(src_name)
with gzip.open(src_name, 'rb') as infile:
#start_urls = ['/path/to/files*.html']#
file_cont = infile.read()
start_urls = file_cont#['file:////file_cont']
Gives the following error:
Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/scrapy/core/engine.py", line 127, in _next_request
request = next(slot.start_requests)
File "/usr/local/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 70, in start_requests
yield self.make_requests_from_url(url)
File "/usr/local/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 73, in make_requests_from_url
return Request(url, dont_filter=True)
File "/usr/local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/usr/local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: %3C