I have a PyQt5 scraper that has to render a webpage before scraping it, since the webpage has dynamic data. This is the most barebones version of the script, which unfortunately still has several parts.
The only reason the render needs to be called from a function is that it sometimes freezes indefinitely, so it has a multithreaded timeout on it. That is all well and good, except that Render won't work properly inside a function because QApplication isn't properly passed in for some reason. I can define App = QApplication(sys.argv) and put the Render class inside the ScrapeClockwise function, but that requires defining App within that function as well (it can't be passed in for some reason). And then, if the function times out, it gets kicked out without closing QApplication, so the next time the function runs the program just crashes. THIS EVEN HAPPENS IF IT IS DEFINED WITHIN A TRY-EXCEPT STATEMENT, which is extra weird.
As you can see, there are a lot of strange interactions here, and if anyone could shed some light on any of them I would be incredibly thankful; I've been beating my head against this for a while now.
import sys
from PyQt5.QtCore import *
from PyQt5.QtWebKitWidgets import *
from PyQt5.QtWidgets import *
from bs4 import BeautifulSoup
import threading
import functools
from threading import Thread
def timeout(timeout):
    """Build a decorator that bounds how long a call is waited for.

    The decorated function is executed in a daemon thread while the caller
    waits at most ``timeout`` seconds for it to finish.

    Args:
        timeout: Maximum wait in seconds, handed to ``Thread.join``.

    Returns:
        A decorator. The wrapped callable returns whatever the original
        returned, re-raises any ``Exception`` the original raised, and raises
        ``TimeoutError`` when no outcome was recorded within the limit.

    Note:
        On timeout the worker thread is not stopped; the caller simply stops
        waiting for it and the thread keeps running in the background.
    """
    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Success and failure live under separate keys so that a function
            # which legitimately *returns* an exception object is not
            # mistaken for one that raised it.
            outcome = {}

            def worker():
                try:
                    outcome['value'] = func(*args, **kwargs)
                except Exception as exc:
                    outcome['error'] = exc

            t = Thread(target=worker)
            t.daemon = True
            try:
                t.start()
                t.join(timeout)
            except Exception:
                print('error starting thread')
                raise

            if 'error' in outcome:
                raise outcome['error']
            if 'value' in outcome:
                return outcome['value']
            # Neither slot was filled: the worker has not finished in time.
            # TimeoutError is an Exception subclass, so existing
            # ``except Exception`` handlers keep working.
            raise TimeoutError(
                'function [%s] timeout [%s seconds] exceeded!'
                % (func.__name__, timeout))
        return wrapper
    return deco
# Application object created once, at import time, from the process's
# command-line arguments. SomeClass and Render below reach it through this
# global to drive the event loop (processEvents / exec_ / quit).
APP = QApplication(sys.argv)
class SomeClass(QWidget):
    """Widget with a helper that pumps the global application's event queue."""

    def some_method(self):
        """Run one ``APP.processEvents`` pass with a fixed set of flags.

        The flags exclude user-input events and socket notifiers and ask Qt
        to wait for more events.
        """
        flags = (QEventLoop.ExcludeUserInputEvents
                 | QEventLoop.ExcludeSocketNotifiers
                 | QEventLoop.WaitForMoreEvents)
        APP.processEvents(flags)
class Render(QWebPage):
    """Web page that loads a URL and blocks until the load has finished.

    Constructing an instance starts the load and enters the event loop of the
    global ``APP``. The loop is left again from the ``loadFinished`` handler,
    after which ``frame`` refers to the page's main frame.

    NOTE(review): Qt expects the application event loop and GUI-related
    objects to be used from the main thread -- confirm this class is never
    constructed from a worker thread (for example through a thread-based
    timeout wrapper).
    """

    def __init__(self, url):
        """Start loading ``url`` and block in ``APP.exec_()`` until it is done."""
        super(Render, self).__init__()
        # Hook up the handler before the load starts so the signal is not missed.
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)
        APP.exec_()

    def _loadFinished(self, result):
        """Store the main frame and leave the event loop.

        ``result`` is the value delivered by the ``loadFinished`` signal; it
        is not inspected here.
        """
        self.frame = self.mainFrame()
        APP.quit()
def ScrapeClockwise(l):
    """Render one appointment page and print the text of its ``<h4>`` tags.

    Args:
        l: Hospital identifier. It is converted with ``str`` and left-padded
            with zeros to four characters before being placed in the URL.
    """
    hospital_id = str(l).zfill(4)
    url = "".join((
        "https://www.clockwisemd.com/hospitals/",
        hospital_id,
        "/appointments/new",
    ))
    print(url)

    # Rendering blocks until the page reports that loading has finished.
    page = Render(url)
    html = page.frame.toHtml()

    soup = BeautifulSoup(html, 'html.parser')
    for heading in soup.find_all('h4'):
        print(heading.get_text())
# Driver: scrape hospital ids 0, 1, 2, ... forever, giving each attempt at
# most five seconds.
#
# NOTE(review): timeout() runs ScrapeClockwise in a worker thread, while
# Render enters APP.exec_() inside that call. Qt expects the application
# event loop to run in the main thread, and a timed-out worker is left
# running rather than stopped -- this is the likely source of the crashes on
# the following iteration; confirm against the Qt threading documentation.
l = 0
# The wrapped callable does not depend on the loop variable; build it once.
func = timeout(5)(ScrapeClockwise)
while True:
    try:
        func(str(l))
    except Exception as e:
        # Best effort: report the failure (timeouts included) and carry on
        # with the next id. Handle errors here.
        print(e)
    l += 1