0

I have tried many things but cannot seem to get anything to work right. Beautifulsoup does a nice job with straight HTML but the javascript is ignored. I am trying to get the complete HTML after js output with QWebEngineView from PyQt5 to pass to Beautifulsoup. I can get it to display the page correctly in a pop-up window but cannot seem to get the HTML in to a object to pass to Beautifulsoup to parse. I tried this How to get html of a page loaded in QWebEngineView and I am still just getting the unexpanded html without the output from the js.

The goal is to get the list of file paths in a list for later use. Since they are coming from the dynamic js code I can only get them manually from the page and not in my module.

Here is my code so far:


import sys

from PyQt5.QtCore import *
from PyQt5.QtGui import *
from PyQt5.QtWidgets import *
from PyQt5.QtWebEngineWidgets import *
from bs4 import BeautifulSoup

class Browser(QMainWindow):
    htmlFinished = pyqtSignal()
    def __init__(self, *args, **kwargs):
        QMainWindow.__init__(self, *args, **kwargs)
        self.mHtml = ""
        self.view = QWebEngineView()
        self.setCentralWidget(self.view)
        self.view.setUrl(QUrl("https://transparency-in-coverage.uhc.com"))
        file_menu = QMenu(self.menuBar())
        file_menu.setTitle("File")
        save_file_action = QAction(QIcon("disk--pencil.png"), "Save Page As...",self)
        file_menu.addAction(save_file_action)
        self.menuBar().addAction(file_menu.menuAction())
        save_file_action.triggered.connect(self.save_file)

    def callback(self, html):
        self.mHtml = html
        self.htmlFinished.emit()

    def save_file(self):
        filename, _ = QFileDialog.getSaveFileName(self, "Save Page As", "", "Hypertext Markup Language (*.htm *.html);;" "All files(*.*)")
        if filename:
            self.view.page().toHtml(self.callback)
            loop = QEventLoop()
            self.htmlFinished.connect(loop.quit)
            loop.exec_()
            with open(filename, 'w') as f:
                f.write(self.mHtml)


if __name__ == '__main__':
    app = QApplication(sys.argv)
    w = Browser()
    w.show()

    html = w.mHtml

    print('page source=' ,html)

    soup = BeautifulSoup(html, 'lxml')

    for file in soup.find_all('ant-space-item'):
        link = file.find('a', href=True)['href']
        print(link)

    sys.exit(app.exec_())

and the suggested solution:

import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtWidgets import QApplication


def callback_function(html):
    print(html)


def on_load_finished():
    web.page().runJavaScript(
        "document.getElementsByTagName('html')[0].outerHTML", callback_function
    )
    # or document.getElementsByTagName('html')[0].outerHTML or innerHTML


app = QApplication(sys.argv)
web = QWebEngineView()
web.load(QUrl("https://transparency-in-coverage.uhc.com"))
web.show()
web.resize(640, 480)
web.loadFinished.connect(on_load_finished)

sys.exit(app.exec_())

also returns the unexpanded HTML.

JoJoB
  • 1
  • 1

0 Answers0