I have tried many things but cannot seem to get anything to work right. Beautifulsoup does a nice job with straight HTML but the javascript is ignored. I am trying to get the complete HTML after js output with QWebEngineView from PyQt5 to pass to Beautifulsoup. I can get it to display the page correctly in a pop-up window but cannot seem to get the HTML in to a object to pass to Beautifulsoup to parse. I tried this How to get html of a page loaded in QWebEngineView and I am still just getting the unexpanded html without the output from the js.
The goal is to get the list of file paths in a list for later use. Since they are coming from the dynamic js code I can only get them manually from the page and not in my module.
Here is my code so far:
import sys
from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *
from bs4 import BeautifulSoup
class Browser(QMainWindow):
htmlFinished = pyqtSignal()
def __init__(self, *args, **kwargs):
QMainWindow.__init__(self, *args, **kwargs)
self.mHtml = ""
self.view = QWebEngineView()
self.setCentralWidget(self.view)
self.view.setUrl(QUrl("https://transparency-in-coverage.uhc.com"))
file_menu = QMenu(self.menuBar())
file_menu.setTitle("File")
save_file_action = QAction(QIcon("disk--pencil.png"), "Save Page As...",self)
file_menu.addAction(save_file_action)
self.menuBar().addAction(file_menu.menuAction())
save_file_action.triggered.connect(self.save_file)
def callback(self, html):
self.mHtml = html
self.htmlFinished.emit()
def save_file(self):
filename, _ = QFileDialog.getSaveFileName(self, "Save Page As", "", "Hypertext Markup Language (*.htm *.html);;" "All files(*.*)")
if filename:
self.view.page().toHtml(self.callback)
loop = QEventLoop()
self.htmlFinished.connect(loop.quit)
loop.exec_()
with open(filename, 'w') as f:
f.write(self.mHtml)
if __name__ == '__main__':
app = QApplication(sys.argv)
w = Browser()
w.show()
html = w.mHtml
print('page source=' ,html)
soup = BeautifulSoup(html, 'lxml')
for file in soup.find_all('ant-space-item'):
link = file.find('a', href=True)['href']
print(link)
sys.exit(app.exec_())
and the suggested solution:
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication
def callback_function(html):
print(html)
def on_load_finished():
web.page().runJavaScript(
"document.getElementsByTagName('html')[0].outerHTML", callback_function
)
# or document.getElementsByTagName('html')[0].outerHTML or innerHTML
app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://transparency-in-coverage.uhc.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)
sys.exit(app.exec_())
also returns the unexpanded HTML.