Edit 2
Second approach: for now, I have given up on using multiple instances and configured the scrapy settings not to use concurrent requests. It's slow but stable. I have opened a bounty: can anyone help me make this work concurrently? If I configure scrapy to run concurrently, I get segmentation faults.
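For reference, the throttled setup is just a settings change; a minimal sketch, assuming otherwise default scrapy settings:

# settings.py -- force sequential downloads so only one
# WebkitBrowser render runs at a time (slow but stable)
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1

The middleware currently looks like this: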
class WebkitDownloader(object):

    def __init__(self):
        os.environ["DISPLAY"] = ":99"
        self.proxyAddress = "a:b@" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)

    def process_response(self, request, response, spider):
        self.request = request
        self.response = response
        if 'cached' not in response.flags:
            # render the response body in a fresh, throwaway browser
            webkitBrowser = webkit.WebkitBrowser(proxy=self.proxyAddress, gui=False, timeout=0.5, delay=0.5,
                                                 forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt'])
            #print "added to queue: " + str(self.counter)
            webkitBrowser.get(html=response.body, num_retries=0)
            html = webkitBrowser.current_html()
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=killgremlins(html))
            response = response.replace(**kwargs)
            webkitBrowser.setPage(None)
            del webkitBrowser
        return response
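One idea for making this concurrent is to push the blocking render into Twisted's thread pool (scrapy runs on Twisted), so the reactor stays free to schedule further requests. This is only a sketch, untested: it assumes scrapy's middleware chain will wait on a Deferred returned from process_response, and that WebkitBrowser can be driven off the main thread at all (Qt/WebKit is notoriously picky about threads, which may be exactly where my segfaults come from):

from twisted.internet import threads

class WebkitDownloader(object):

    def __init__(self):
        os.environ["DISPLAY"] = ":99"
        self.proxyAddress = "a:b@" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)

    def process_response(self, request, response, spider):
        if 'cached' in response.flags:
            return response
        # run the blocking render in a worker thread and hand the
        # Deferred back to scrapy instead of the finished response
        return threads.deferToThread(self._render, response)

    def _render(self, response):
        # runs in a worker thread; one throwaway browser per request
        webkitBrowser = webkit.WebkitBrowser(proxy=self.proxyAddress, gui=False, timeout=0.5, delay=0.5,
                                             forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt'])
        webkitBrowser.get(html=response.body, num_retries=0)
        html = webkitBrowser.current_html()
        respcls = responsetypes.from_args(headers=response.headers, url=response.url)
        response = response.replace(cls=respcls, body=killgremlins(html))
        webkitBrowser.setPage(None)
        return response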
Edit:
I tried to answer my own question in the meantime and implemented a queue, but for some reason it does not run asynchronously: while webkitBrowser.get(html=response.body, num_retries=0) is busy, scrapy is blocked until the method has finished, and new requests are not assigned to the remaining free instances in self.queue.
Can anyone point me in the right direction to make this work?
class WebkitDownloader(object):

    def __init__(self):
        proxyAddress = "http://" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)
        self.queue = list()
        for i in range(8):
            self.queue.append(webkit.WebkitBrowser(proxy=proxyAddress, gui=True, timeout=0.5, delay=5.5,
                                                   forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt']))

    def process_response(self, request, response, spider):
        # pick the first browser in the pool that is not busy;
        # the loop variable already holds it after the break
        i = 0
        for webkitBrowser in self.queue:
            if webkitBrowser.status == "WAITING":
                break
            i += 1
        if webkitBrowser.status == "WAITING":
            # load webpage
            print "added to queue: " + str(i)
            webkitBrowser.get(html=response.body, num_retries=0)
            webkitBrowser.scrapyResponse = response
            # busy-wait until the render finishes -- this spins on the
            # reactor thread, so the whole crawl stalls here
            while webkitBrowser.status == "PROCESSING":
                print "waiting for queue: " + str(i)
            if webkitBrowser.status == "DONE":
                print "fetched from queue: " + str(i)
                #response = webkitBrowser.scrapyResponse
                html = webkitBrowser.current_html()
                respcls = responsetypes.from_args(headers=response.headers, url=response.url)
                kwargs = dict(cls=respcls, body=killgremlins(html))
                #response = response.replace(**kwargs)
                webkitBrowser.status = "WAITING"
        return response
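My current suspicion is that the while loop itself is the problem: it spins on the reactor thread, so Twisted never gets a chance to dispatch the next request to a free browser. A non-blocking poll might look roughly like this (a sketch only, assuming WebkitBrowser.status gets updated asynchronously by the underlying Qt event loop):

from twisted.internet import defer, reactor, task

def wait_until_done(webkitBrowser, interval=0.1):
    # return a Deferred that fires with the browser once its status
    # flips to "DONE", re-checking every `interval` seconds instead
    # of spinning in a while loop (which blocks the whole reactor)
    d = defer.Deferred()
    def poll():
        if webkitBrowser.status == "DONE":
            d.callback(webkitBrowser)
        else:
            task.deferLater(reactor, interval, poll)
    poll()
    return d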
I am using WebKit in a scrapy middleware to render JavaScript. Currently, scrapy is configured to process one request at a time (no concurrency).
I'd like to use concurrency (e.g. 8 requests at a time), but then I need to make sure that the 8 instances of WebkitBrowser() receive requests according to their individual processing state: a fresh request should be handed to an instance as soon as its WebkitBrowser.get() is done and it is ready for the next one.
How would I achieve that with Python? This is my current middleware:
class WebkitDownloader(object):

    def __init__(self):
        proxyAddress = "http://" + PROXY_DEFAULT_HOST + ":" + str(PROXY_DEFAULT_PORT)
        self.w = webkit.WebkitBrowser(proxy=proxyAddress, gui=True, timeout=0.5, delay=0.5,
                                      forbidden_extensions=['js', 'css', 'swf', 'pdf', 'doc', 'xls', 'ods', 'odt'])

    def process_response(self, request, response, spider):
        if ".pdf" not in response.url:
            # load webpage
            self.w.get(html=response.body, num_retries=0)
            html = self.w.current_html()
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=killgremlins(html))
            response = response.replace(**kwargs)
        return response
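The shape of the solution I have in mind is a pool of 8 browsers guarded by something like Twisted's DeferredSemaphore, so at most 8 renders are in flight and each browser goes back into the pool when its get() finishes. A hypothetical, untested sketch of that dispatch logic (same thread-safety caveat as above; WebkitPool and its methods are names I made up):

from twisted.internet import defer, threads

class WebkitPool(object):
    """Hand out idle WebkitBrowser instances, at most `size` in flight."""

    def __init__(self, size=8, proxy=None):
        self.semaphore = defer.DeferredSemaphore(size)
        self.browsers = [webkit.WebkitBrowser(proxy=proxy, gui=False, timeout=0.5, delay=0.5)
                         for _ in range(size)]

    def render(self, body):
        # run() acquires a slot as soon as one of the `size` tokens is
        # free, calls _render, and releases the slot when it finishes
        return self.semaphore.run(self._render, body)

    def _render(self, body):
        webkitBrowser = self.browsers.pop()  # take an idle browser

        def release(result):
            self.browsers.append(webkitBrowser)  # put it back in the pool
            return result                        # pass html (or failure) along

        d = threads.deferToThread(self._blocking_get, webkitBrowser, body)
        d.addBoth(release)
        return d

    def _blocking_get(self, webkitBrowser, body):
        # runs in a worker thread; this is the blocking part
        webkitBrowser.get(html=body, num_retries=0)
        return webkitBrowser.current_html()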