I'm trying to scrape projecteuler.net with Python's Scrapy library, just to practice with it. I've seen more than one existing implementation of such a scraper online, but they seem overly elaborate for my needs. I simply want to save the problems (titles, IDs, contents) to a JSON file and then load them with AJAX into a local web page on my PC.
I'm working on my own solution, which I will finish in any case, but since I want to learn the smartest way to use the library, I'm asking you to propose the cleanest Scrapy program for this job (if you prefer to skip the JSON step and save directly to HTML, that may be even better for me).
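To be concrete about the JSON part: what I have in mind is simply Scrapy's built-in feed export, e.g. running scrapy crawl euler -o problems.json, or the equivalent settings on the spider itself. A minimal sketch of that idea (the spider name and the output filename are just my choices, and the class is a stripped-down placeholder):

import scrapy


class EulerJsonSpider(scrapy.Spider):
    # Stripped-down placeholder: the only point here is the feed-export
    # settings, which tell Scrapy to dump every scraped item into problems.json,
    # the file I plan to fetch with AJAX from my local page.
    name = 'euler_json'
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'problems.json',
    }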
This is my first approach (it doesn't work):
# -*- coding: utf-8 -*-
import requests  # only used by the commented-out pagination probe below
import scrapy
from eulerscraper.items import Problem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule


def start_urls_detection():
    # Probe the archive pages until one returns a non-200 status.
    # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
    # i = 1
    #
    # while True:
    #     request = requests.get(su[i])
    #
    #     if request.status_code != 200:
    #         break
    #
    #     i += 1
    #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))
    return ["https://projecteuler.net/"]


class EulerSpider(CrawlSpider):
    name = 'euler'
    allowed_domains = ['projecteuler.net']
    start_urls = start_urls_detection()

    rules = (
        # Extract links matching 'category.php' (but not matching 'subsection.php')
        # and follow links from them (since no callback means follow=True by default).
        # Rule(LinkExtractor(allow=('category\.php',), deny=('subsection\.php',))),

        # Parse every individual problem page with parse_problems.
        Rule(LinkExtractor(allow=(r'problem=\d+',)), callback="parse_problems"),
        # Follow the paginated archive listings (no callback, so just follow).
        Rule(LinkExtractor(allow=(r'archives;page=\d+',), unique=True), follow=True),
    )

    def start_requests(self):
        # Same probing idea as start_urls_detection(), commented out for now.
        # su = ['https://projecteuler.net/archives', 'https://projecteuler.net/archives;page=2']
        # i = 1
        #
        # while True:
        #     request = requests.get(su[i])
        #
        #     if request.status_code != 200:
        #         break
        #
        #     i += 1
        #     su.append('https://projecteuler.net/archives;page=' + str(i + 1))
        return [scrapy.Request("https://projecteuler.net/archives", self.parse)]

    def parse_problems(self, response):
        # Load one Problem item from the page's title, id banner and content div.
        l = ItemLoader(item=Problem(), response=response)
        l.add_css("title", "h2")
        l.add_css("id", "#problem_info")
        l.add_css("content", ".problem_content")
        yield l.load_item()

    # def parse_content(self, response):
    #     # return response.css("div.problem_content::text").extract()
    #     next_page = "https://projecteuler.net/archives;page=2"
    #     n = 3
    #
    #     while n < 14:
    #         next_page = response.urljoin(next_page)
    #         yield scrapy.Request(next_page, callback=self.parse)
    #         next_page = next_page[0:len(next_page) - 1] + str(n)
    #         n += 1
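For reference, my Problem item in eulerscraper/items.py currently looks roughly like this; the input/output processors are my guess at how to strip the markup, since add_css("title", "h2") otherwise returns the whole element with its tags:

import scrapy
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from w3lib.html import remove_tags


class Problem(scrapy.Item):
    # One field per add_css() call in the spider; remove_tags plus strip
    # turn the raw HTML snippets into plain text.
    title = scrapy.Field(input_processor=MapCompose(remove_tags, lambda s: s.strip()),
                         output_processor=TakeFirst())
    id = scrapy.Field(input_processor=MapCompose(remove_tags, lambda s: s.strip()),
                      output_processor=TakeFirst())
    content = scrapy.Field(input_processor=MapCompose(remove_tags, lambda s: s.strip()),
                           output_processor=Join('\n'))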
Now I will try a combination of LinkExtractor and manual requests. In the meantime, I'll be waiting for your solutions...
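In case it clarifies what I mean by that combo, this is the rough shape I'm thinking of: a plain Spider that follows the archive pagination itself and uses a LinkExtractor only to pull the problem links out of each listing page (names like EulerManualSpider are just placeholders):

import scrapy
from eulerscraper.items import Problem
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader


class EulerManualSpider(scrapy.Spider):
    name = 'euler_manual'
    allowed_domains = ['projecteuler.net']
    start_urls = ['https://projecteuler.net/archives']

    problem_links = LinkExtractor(allow=(r'problem=\d+',))
    page_links = LinkExtractor(allow=(r'archives;page=\d+',))

    def parse(self, response):
        # Request every problem linked from this archive page...
        for link in self.problem_links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_problem)
        # ...then follow the pagination links (Scrapy filters out duplicates).
        for link in self.page_links.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse)

    def parse_problem(self, response):
        # Same item loading as in the CrawlSpider version above.
        loader = ItemLoader(item=Problem(), response=response)
        loader.add_css("title", "h2")
        loader.add_css("id", "#problem_info")
        loader.add_css("content", ".problem_content")
        yield loader.load_item()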