
I tried scrapy-splash with http://www.google.com and followed all the prerequisite steps given in the following GitHub repo: https://github.com/scrapy-plugins/scrapy-splash, and I was able to render the Google page.
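
(For context, the plain scrapy-splash request that worked looked roughly like the sketch below; the spider name is illustrative, and it assumes the SPLASH_URL and middleware settings from the scrapy-splash README.)

import scrapy
from scrapy_splash import SplashRequest


class GoogleRenderSpider(scrapy.Spider):
    # Illustrative spider name, not part of the original project
    name = 'google-render'

    def start_requests(self):
        # Plain scrapy-splash request, no Crawlera involved:
        # Splash fetches and renders the page directly.
        yield SplashRequest(
            url='http://www.google.com',
            callback=self.parse,
            args={'wait': 0.5},
        )

    def parse(self, response):
        # response.text is the HTML as rendered by Splash
        self.logger.info('Rendered %d bytes from %s', len(response.text), response.url)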

However, when I tried the same http://www.google.com after integrating Crawlera with scrapy-splash as described in the following GitHub repo: https://github.com/scrapinghub/sample-projects/tree/master/splash_crawlera_example, I always get a 504 Timeout exception.

The default sample URL http://quotes.toscrape.com/js/ from the splash_crawlera_example project renders successfully through Crawlera, but Google does not. Is there anything that needs to be changed in the script to render the Google page?

Here is quotes-js.py:

from pkgutil import get_data
import scrapy
from scrapy_splash import SplashRequest
from w3lib.http import basic_auth_header


class QuotesJsSpider(scrapy.Spider):
    name = 'quotes-js'

    def __init__(self, *args, **kwargs):
        # to be able to load the Lua script on Scrapy Cloud, make sure your
        # project's setup.py file contains the "package_data" setting, similar
        # to this project's setup.py
        self.LUA_SOURCE = get_data(
            'splash_crawlera_example', 'scripts/crawlera.lua'
        ).decode('utf-8')
        super(QuotesJsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        yield SplashRequest(
            # url='http://quotes.toscrape.com/js/',
            url='http://www.google.com',
            endpoint='execute',
            splash_headers={
                'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
            },
            args={
                'lua_source': self.LUA_SOURCE,
                'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                'wait': 0.5, 'viewport': '1024x2480', 'images': 0, 'timeout': 90
            },
            # tell Splash to cache the lua script, to avoid sending it for every request
            cache_args=['lua_source'],
        )

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = response.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield SplashRequest(
                url=response.urljoin(next_page),
                endpoint='execute',
                splash_headers={
                    'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''),
                },
                args={
                    'lua_source': self.LUA_SOURCE,
                    'crawlera_user': self.settings['CRAWLERA_APIKEY'],
                },
                cache_args=['lua_source'],
            )
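
(The package_data note in __init__ refers to the project's setup.py; a rough sketch of such a file is below. The version number and entry point are illustrative, not copied from the sample project.)

from setuptools import setup, find_packages

setup(
    name='splash_crawlera_example',
    version='1.0',
    packages=find_packages(),
    # Ship the Lua script inside the package so pkgutil.get_data() can
    # find it after the project is deployed to Scrapy Cloud.
    package_data={'splash_crawlera_example': ['scripts/*.lua']},
    entry_points={'scrapy': ['settings = splash_crawlera_example.settings']},
)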

settings.py:

# -*- coding: utf-8 -*-

BOT_NAME = 'splash_crawlera_example'
SPIDER_MODULES = ['splash_crawlera_example.spiders']
NEWSPIDER_MODULE = 'splash_crawlera_example.spiders'

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawlera.CrawleraMiddleware': 300,
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

CRAWLERA_APIKEY = ''  # Your crawlera API key

# Splash settings
SPLASH_URL = 'http://localhost:8050/'     # Splash instance URL from Scrapy Cloud
SPLASH_APIKEY = ''  # Your API key for the Splash instance hosted on Scrapy Cloud
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'


CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 1800
DOWNLOAD_DELAY = 1

DEFAULT_HEADERS = {
    'X-Crawlera-Max-Retries': 0
}

crawlera.lua:

function use_crawlera(splash)
    -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg.
    -- Have a look at the file spiders/quotes-js.py to see how to do it.
    -- Find your Crawlera credentials in https://app.scrapinghub.com/
    local user = splash.args.crawlera_user

    local host = 'proxy.crawlera.com'
    local port = 8010
    local session_header = 'X-Crawlera-Session'
    local session_id = 'create'

    splash:on_request(function (request)
        -- The commented code below can be used to speed up the crawling
        -- process. They filter requests to undesired domains and useless
        -- resources. Uncomment the ones that make sense to your use case
        -- and add your own rules.

        -- Discard requests to advertising and tracking domains.
        -- if string.find(request.url, 'doubleclick%.net') or
        --    string.find(request.url, 'analytics%.google%.com') then
        --     request.abort()
        --     return
        -- end

        -- Avoid using Crawlera for subresources fetching to increase crawling
        -- speed. The example below avoids using Crawlera for URLS starting
        -- with 'static.' and the ones ending with '.png'.
        -- if string.find(request.url, '://static%.') ~= nil or
        --    string.find(request.url, '%.png$') ~= nil then
        --     return
        -- end

        request:set_header('X-Crawlera-Cookies', 'disable')
        request:set_header(session_header, session_id)
        request:set_proxy{host, port, username=user, password=''}
    end)

    splash:on_response_headers(function (response)
        if response.headers[session_header] ~= nil then
            session_id = response.headers[session_header]
        end
    end)
end

function main(splash)
    use_crawlera(splash)
    splash:go(splash.args.url)
    return splash:html()
end
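
To narrow down whether the 504 comes from Splash/Crawlera or from the Scrapy side, the same Lua script can be sent straight to the Splash /execute HTTP API, for example with requests. This is only a debugging sketch; the local Splash URL, the script path, and the placeholder API keys are assumptions:

import requests

SPLASH_EXECUTE = 'http://localhost:8050/execute'  # or your hosted Splash instance

with open('splash_crawlera_example/scripts/crawlera.lua') as f:
    lua_source = f.read()

resp = requests.post(
    SPLASH_EXECUTE,
    # auth is only needed for a Splash instance hosted on Scrapy Cloud
    auth=('<SPLASH_APIKEY>', ''),
    json={
        'lua_source': lua_source,
        'url': 'http://www.google.com',
        'crawlera_user': '<CRAWLERA_APIKEY>',
        'timeout': 90,
    },
)
print(resp.status_code)   # a 504 here points at Splash/Crawlera rather than Scrapy
print(resp.text[:500])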
