I am trying to loop over a list of URLs with PyQt4 and BeautifulSoup, using the following code:
import sys
from bs4 import BeautifulSoup
from PyQt4.QtGui import QApplication
from PyQt4.QtCore import QUrl, pyqtSignal
from PyQt4.QtWebKit import QWebPage
class Render(QWebPage):
    """Load a list of URLs one after another and pass each page to a callback.

    The constructor creates the QApplication, starts the first load and then
    blocks in the Qt event loop until the queue of URLs has been consumed.
    """

    def __init__(self, urls, cb):
        """Set up the page and run the crawl to completion.

        Args:
            urls: iterable of URL strings, loaded in the given order.
            cb: callable invoked as ``cb(url, html)`` each time the
                ``loadFinished`` signal fires.
        """
        # The application object is created before the QWebPage base class
        # is initialised, keeping the original construction order.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        # Work on a private copy so that pop() below does not empty the
        # list object owned by the caller.
        self.urls = list(urls)
        self.cb = cb
        # With nothing to load, crawl() would call quit() before the event
        # loop is running, where it has no effect, and exec_() would then
        # block forever. Only enter the loop when there is work to do.
        if self.urls:
            self.crawl()
            self.app.exec_()

    def crawl(self):
        """Start loading the next queued URL, or quit when none are left."""
        if not self.urls:
            self.app.quit()
            return
        url = self.urls.pop(0)
        print ('Downloading', url)
        # NOTE(review): if this URL differs from the currently loaded one
        # only in its fragment (the part after '#'), WebKit may treat it as
        # in-page navigation and never emit loadFinished, which would stall
        # the crawl here -- confirm against the QWebFrame documentation.
        self.mainFrame().load(QUrl(url))

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: hand the page to ``cb`` and move on.

        Args:
            result: success flag delivered by the signal. It is not
                inspected; the callback runs for failed loads as well.
        """
        frame = self.mainFrame()
        url = str(frame.url().toString())
        html = frame.toHtml()
        try:
            self.cb(url, html)
        finally:
            # Always advance the queue. If the callback raised and crawl()
            # were skipped, neither the next load nor quit() would ever be
            # triggered and the event loop would keep running with nothing
            # left to do.
            self.crawl()
def scrape(url, html):
    """Print the text of the first matching detail ``div`` of a page.

    Args:
        url: address the page was loaded from; used only in the message
            printed when no matching element exists.
        html: page markup; converted with ``unicode()`` before parsing
            (presumably a QString coming from ``QWebFrame.toHtml()`` --
            verify against the caller).
    """
    soup = BeautifulSoup(unicode(html), "lxml")
    matches = soup.findAll("div", {"class": "detalhamento_label_valor hidden-print ng-binding"})
    if not matches:
        # Indexing [0] on an empty result raises IndexError; report the miss
        # instead so that the caller driving the crawl is not interrupted.
        print("No matching div found for %s" % url)
        return
    # Single-argument print(...) prints the same on Python 2 as `print t`.
    print(matches[0].text)
# Pages to fetch, in order. The first two entries are identical; the third
# differs from them only in the part after '#'.
urls = [
    "http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal",
    "http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000005?modulo=0&sistema=portal",
    "http://apps.mpf.mp.br/aptusmpf/index2#/detalhe/920000000000000000004?modulo=0&sistema=portal",
]

# Constructing Render starts the crawl immediately; scrape is the per-page
# callback.
r = Render(urls, cb=scrape)
It seems to work well if the URLs are the same (items [0] and [1]), but it gets stuck once the URL changes (item [2]). I am not really familiar with PyQt4, so I wonder if there is something trivial I might be missing.
EDIT
The program hangs while processing the third item of the URL list, at this call:
self.mainFrame().load(QUrl(url))
Other than that, the only warning I get is:
libpng warning: iCCP: known incorrect sRGB profile
Though I'm not sure what it means, it does not seem to be connected to the issue.