I have deployed Scrapyd as a Docker container on Google Cloud Run. When I run the container locally, everything works fine. But when I deploy the same container on Google Cloud Run, spider jobs are not removed from the Running queue. The jobs finish, yet they are never removed from the queue. Any thoughts?
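For context, a minimal sketch (not from the original post) of how the symptom can be observed through Scrapyd's listjobs.json API; the project name "myproject" and the service URL are placeholders:

# Query Scrapyd's listjobs.json endpoint and show which jobs it still
# reports as running. Replace the URL and project name with your own.
import json
from urllib.request import urlopen

SCRAPYD_URL = "http://localhost:6800"  # e.g. your Cloud Run service URL

with urlopen(f"{SCRAPYD_URL}/listjobs.json?project=myproject") as resp:
    jobs = json.load(resp)

# On Cloud Run, jobs that have actually finished may keep appearing under "running".
print("running:", [j["id"] for j in jobs.get("running", [])])
print("finished:", [j["id"] for j in jobs.get("finished", [])])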
2 Answers
As mentioned in the GitHub issue, close the webdriver in the spider's close handler, as follows:
def __init__(self, *args, **kwargs):
    # webkit driver
    self.driver = webdriver.PhantomJS(executable_path=PHANTOMJS, service_log_path='/tmp/ghostdriver.log')
    self.driver.implicitly_wait(1)
    self.driver.set_page_load_timeout(3)

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    spider = super(NewsDuowanSpider, cls).from_crawler(crawler, *args, **kwargs)
    crawler.signals.connect(spider.spider_closed, signal=scrapy.signals.spider_closed)
    return spider

def spider_closed(self, spider):
    spider.logger.info('Spider closed: %s', spider.name)
    spider.driver.quit()
For more information, you can refer to the link and documentation.

Divyani Yadav
- But I am not using a webdriver here. – Shivkumar Agrawal Mar 08 '22 at 17:36
- Have a look at this link: https://github.com/scrapy/scrapyd/issues/21. Is it helpful? – Divyani Yadav Mar 10 '22 at 12:54
I hit the same issue when I deployed Scrapyd on Cloud Run. The cause seems to be that the subprocess exits but the processEnded method is never invoked; adding a processExited method that frees the slot solved it.
I temporarily worked around it by using my own launcher:
# Imports matching scrapyd's stock launcher module (adjust to your scrapyd version)
import sys
from datetime import datetime
from multiprocessing import cpu_count

from twisted.application.service import Service
from twisted.internet import defer, error, protocol, reactor
from twisted.python import log

from scrapyd import __version__
from scrapyd.interfaces import IEnvironment, IJobStorage, IPoller
from scrapyd.utils import get_crawl_args, native_stringify_dict


class Launcher(Service):

    name = 'launcher'

    def __init__(self, config, app):
        self.processes = {}
        self.finished = app.getComponent(IJobStorage)
        self.max_proc = self._get_max_proc(config)
        self.runner = config.get('runner', 'scrapyd.runner')
        self.app = app

    def startService(self):
        for slot in range(self.max_proc):
            self._wait_for_project(slot)
        log.msg(
            format='Scrapyd %(version)s started: max_proc=%(max_proc)r, runner=%(runner)r',
            version=__version__,
            max_proc=self.max_proc,
            runner=self.runner,
            system='Launcher')

    def _wait_for_project(self, slot):
        poller = self.app.getComponent(IPoller)
        poller.next().addCallback(self._spawn_process, slot)

    def _spawn_process(self, message, slot):
        msg = native_stringify_dict(message, keys_only=False)
        project = msg['_project']
        args = [sys.executable, '-m', self.runner, 'crawl']
        args += get_crawl_args(msg)
        e = self.app.getComponent(IEnvironment)
        env = e.get_environment(msg, slot)
        env = native_stringify_dict(env, keys_only=False)
        pp = ScrapyProcessProtocol(slot, project, msg['_spider'],
                                   msg['_job'], env)
        pp.deferred.addBoth(self._process_finished, slot)
        reactor.spawnProcess(pp, sys.executable, args=args, env=env)
        self.processes[slot] = pp

    def _process_finished(self, _, slot):
        process = self.processes.pop(slot)
        process.end_time = datetime.now()
        self.finished.add(process)
        self._wait_for_project(slot)

    def _get_max_proc(self, config):
        max_proc = config.getint('max_proc', 0)
        if not max_proc:
            try:
                cpus = cpu_count()
            except NotImplementedError:
                cpus = 1
            max_proc = cpus * config.getint('max_proc_per_cpu', 4)
        return max_proc


class ScrapyProcessProtocol(protocol.ProcessProtocol):

    def __init__(self, slot, project, spider, job, env):
        self.slot = slot
        self.pid = None
        self.project = project
        self.spider = spider
        self.job = job
        self.start_time = datetime.now()
        self.end_time = None
        self.env = env
        self.logfile = env.get('SCRAPY_LOG_FILE')
        self.itemsfile = env.get('SCRAPY_FEED_URI')
        self.deferred = defer.Deferred()

    def outReceived(self, data):
        log.msg(data.rstrip(), system="Launcher,%d/stdout" % self.pid)

    def errReceived(self, data):
        log.msg(data.rstrip(), system="Launcher,%d/stderr" % self.pid)

    def connectionMade(self):
        self.pid = self.transport.pid
        self.log("Process started: ")

    def processEnded(self, status):
        if isinstance(status.value, error.ProcessDone):
            self.log("Process finished: ")
        else:
            self.log("Process died: exitstatus=%r " % status.value.exitCode)
        self.deferred.callback(self)

    # on Cloud Run processEnded is not invoked, but processExited is
    def processExited(self, status):
        self.processEnded(status)

    def log(self, action):
        fmt = '%(action)s project=%(project)r spider=%(spider)r job=%(job)r pid=%(pid)r log=%(log)r items=%(items)r'
        log.msg(format=fmt, action=action, project=self.project, spider=self.spider,
                job=self.job, pid=self.pid, log=self.logfile, items=self.itemsfile)
Finally, you should point Scrapyd at the new launcher in the scrapyd.conf file:
launcher = {your launcher module}.Launcher
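For reference, a minimal scrapyd.conf sketch with that setting filled in. The module name my_launcher is hypothetical (use whichever module holds your Launcher class); the other keys mirror Scrapyd's sample configuration, with bind_address opened up for a containerized deployment:

[scrapyd]
eggs_dir = eggs
logs_dir = logs
max_proc = 0
max_proc_per_cpu = 4
bind_address = 0.0.0.0
http_port = 6800
# point Scrapyd at the custom launcher class defined above
launcher = my_launcher.Launcher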

Wison Wang