I tried scrapy-splash with http://www.google.com, following all of the prerequisite steps in this GitHub repo: https://github.com/scrapy-plugins/scrapy-splash, and I was able to render the Google page.
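For reference, the plain scrapy-splash setup that rendered the Google page looked roughly like this (a minimal sketch, assuming a local Splash instance on localhost:8050 and the middleware settings from the scrapy-splash README; the spider name is just for illustration):

import scrapy
from scrapy_splash import SplashRequest


class GoogleRenderSpider(scrapy.Spider):
    # hypothetical spider, only used to verify the plain scrapy-splash setup
    name = 'google-render'

    custom_settings = {
        'SPLASH_URL': 'http://localhost:8050/',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    }

    def start_requests(self):
        # render.html returns the rendered HTML of the page (no Crawlera involved)
        yield SplashRequest(
            url='http://www.google.com',
            endpoint='render.html',
            args={'wait': 0.5},
        )

    def parse(self, response):
        # just confirm the page rendered
        self.logger.info('Rendered %s, title: %s', response.url,
                         response.css('title::text').extract_first())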
However, when I tried the same http://www.google.com with Crawlera integrated into scrapy-splash, as described in this GitHub repo: https://github.com/scrapinghub/sample-projects/tree/master/splash_crawlera_example, I always get a 504 Timeout exception.
The default sample URL http://quotes.toscrape.com/js/ used in splash_crawlera_example renders successfully through Crawlera, but Google does not. Does anything in the script need to change to render the Google page?
Here is quotes-js.py:
from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header
class QuotesJsSpider(scrapy.Spider):
    name = 'quotes-js'

    def __init__(self, *args, **kwargs):
        # to be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting, similar
        # to this project's setup.py
        self.LUA_SOURCE = get_data(
            'splash_crawlera_example', 'scripts/crawlera.lua'
        ).decode('utf-8')
        super(QuotesJsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            # url='http://quotes.toscrape.com/js/',
            url='http://www.google.com',
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90,
            },
            # tell Splash to cache the lua script, to avoid sending it for every request
            cache_args=['lua_source'],
        )

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }

        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(
                url=response.urljoin(next_page),
                endpoint='execute',
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                },
                args={
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
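The comment in __init__ about "package_data" refers to shipping the Lua script with the project so pkgutil.get_data() can load it on Scrapy Cloud; for reference, the relevant part of setup.py looks roughly like this (a sketch following the sample project's layout, so the exact names may differ):

from setuptools import setup, find_packages

setup(
    name='splash-crawlera-example',
    version='1.0',
    packages=find_packages(),
    # ship the Lua script alongside the Python package
    package_data={'splash_crawlera_example': ['scripts/*.lua']},
    entry_points={'scrapy': ['settings = splash_crawlera_example.settings']},
)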
Settings.py
# -*- coding: utf-8 -*-
BOT_NAME = 'splash_crawlera_example'
SPIDER_MODULES = ['splash_crawlera_example.spiders']
NEWSPIDER_MODULE = 'splash_crawlera_example.spiders'
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
CRAWLERA_APIKEY = '' # Your crawlera API key
# Splash settings
SPLASH_URL = 'http://localhost:8050/' # Splash instance URL from Scrapy Cloud
SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 1800
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
    'X-Crawlera-Max-Retries': 0
}
crawlera.lua
function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user
    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up the crawling
        -- process. They filter requests to undesired domains and useless
        -- resources. Uncomment the ones that make sense to your use case
        -- and add your own rules.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresources fetching to increase crawling
        -- speed. The example below avoids using Crawlera for URLS starting
        -- with 'static.' and the ones ending with '.png'.
        -- if string.find(request.url, '://static%.') ~= nil or
        --    string.find(request.url, '%.png$') ~= nil then
        --     return
        -- end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        -- only update the session id when Crawlera actually returned the header
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
    return splash:html()
end
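For context, the SplashRequest in start_requests boils down to an HTTP POST to the Splash /execute endpoint with the Lua script and args above; calling it directly, roughly as sketched below, would show whether the 504 also happens outside Scrapy (a sketch only; the script path and placeholder API keys are assumptions, and the exact payload scrapy-splash builds may differ):

import requests

SPLASH_URL = 'http://localhost:8050/'  # or the hosted Splash instance URL
SPLASH_APIKEY = ''                     # Splash API key (if the instance is hosted)
CRAWLERA_APIKEY = ''                   # Crawlera API key

with open('splash_crawlera_example/scripts/crawlera.lua') as f:
    lua_source = f.read()

resp = requests.post(
    SPLASH_URL.rstrip('/') + '/execute',
    auth=(SPLASH_APIKEY, ''),
    json={
        'lua_source': lua_source,
        'url': 'http://www.google.com',
        'crawlera_user': CRAWLERA_APIKEY,
        'wait': 0.5,
        'viewport': '1024x2480',
        'images': 0,
        'timeout': 90,
    },
)
print(resp.status_code)  # check whether the 504 is reproducible outside Scrapy
print(resp.text[:500])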