GetPageSource for wx.html2 in Python

Question

I am trying to scrape dynamic information from an HTML site. Since the place that I work from is a bank, I can only use the libraries they have certified as safe. Selenium and PyQt are not an option so I must work with wx.html2.

I found this code snippet:

import wx 
import wx.html2 

class MyBrowser(wx.Dialog):
  """Dialog whose only content is a wx.html2 WebView opened on a fixed URL."""

  def __init__(self, *args, **kwds):
    # The dialog itself must exist before it can parent the WebView.
    wx.Dialog.__init__(self, *args, **kwds)
    start_page = "http://www.exampleurl.com"
    self.browser = wx.html2.WebView.New(self, url=start_page)

    # A single vertical sizer lets the WebView take all available space.
    layout = wx.BoxSizer(wx.VERTICAL)
    layout.Add(self.browser, proportion=1, flag=wx.EXPAND, border=10)
    self.SetSizer(layout)
    self.SetSize((700, 700))

if __name__ == "__main__":
  # Create the application object, show the browser dialog non-modally,
  # then hand control to the wx event loop.
  app = wx.App()
  browser_dialog = MyBrowser(None, wx.ID_ANY)
  browser_dialog.Show()
  app.MainLoop()

This allows me to open a web dialog and see the desired content. What I want is to used the wx.html2.WebView GetPageSource() method.

https://wxpython.org/Phoenix/docs/html/wx.html2.WebView.html

I have tried it in the MyBrowser class as:

class MyBrowser(wx.Dialog): 
  """Variant of the dialog above that also tries to print the page content."""
  def __init__(self, *args, **kwds): 
    url = "http://www.exampleurl.com"
    wx.Dialog.__init__(self, *args, **kwds) 
    sizer = wx.BoxSizer(wx.VERTICAL) 
    # The WebView is created with the target URL passed in directly.
    self.browser = wx.html2.WebView.New(self, url=url) 
    sizer.Add(self.browser, 1, wx.EXPAND, 10) 
    self.SetSizer(sizer) 
    self.SetSize((700, 700)) 
    # NOTE(review): this runs inside __init__, i.e. before app.MainLoop() is
    # entered, so the page has presumably not loaded yet at this point.
    # NOTE(review): the surrounding text asks about GetPageSource(), but the
    # call below is GetPageContent -- confirm the method name against the
    # wx.html2.WebView documentation.
    print(wx.html2.WebView.GetPageContent(self.browser))

Any idea on how to get this source HTML or an insight on this class would be appreciated. Thanks.

Rolf of Saxony · Answer 1 · 2020-06-05T09:36:27.403

I suspect that, without an event to trigger the request, your attempts to get the source were happening too soon, i.e. the page hadn't loaded yet.

The following simply prints the source code of the page.
You'll probably want to display it or dump it to a file, I leave that to you.

After your comment about automation, here is a version of the code that in a quick, hacky way, demonstrates your requirement.

import wx
import wx.html2
import time

class MyBrowser(wx.Dialog):
    """Dialog that loads a list of URLs one after another and prints each
    page's source.

    The first URL is loaded at construction time.  Every time the WebView
    reports ``EVT_WEBVIEW_LOADED`` the page is given a short grace period,
    its source is printed and the next URL (if any) is loaded.  The
    "Get Source" button prints the source of the current page on demand and
    "Quit" (or closing the window) destroys the dialog.
    """

    # Grace period, in milliseconds, between the "loaded" event and the
    # capture of the page source, so late-running scripts can finish.
    SETTLE_DELAY_MS = 2000

    def __init__(self, *args, **kwds):
        wx.Dialog.__init__(self, *args, **kwds)
        self.url = ["http://www.stackoverflow.com","http://www.duckduckgo.com"]
        # Index into self.url of the page currently being loaded/shown.
        self.frame_no = 0

        self.browser = wx.html2.WebView.New(self)
        self.get = wx.Button(self, wx.ID_ANY, "Get Source")
        self.quit = wx.Button(self, wx.ID_ANY, "Quit")

        sizer = wx.BoxSizer(wx.VERTICAL)
        sizer.Add(self.browser, 1, wx.EXPAND, 10)
        sizer.Add(self.get, 0, wx.ALL, 0)
        sizer.Add(self.quit, 0, wx.ALL, 0)
        self.SetSizer(sizer)
        self.SetSize((700, 700))

        self.browser.Bind(wx.html2.EVT_WEBVIEW_LOADED, self.OnAutoGet)
        self.Bind(wx.EVT_CLOSE, self.OnQuit)
        self.get.Bind(wx.EVT_BUTTON, self.OnGet)
        self.quit.Bind(wx.EVT_BUTTON, self.OnQuit)

        # Handlers are bound first so the very first load is not missed.
        self.browser.LoadURL(self.url[self.frame_no])

    def OnAutoGet(self, event):
        """Handle EVT_WEBVIEW_LOADED: schedule the capture of the page source.

        The capture is deferred with wx.CallLater rather than time.sleep():
        sleeping inside an event handler freezes the whole GUI event loop
        for the duration of the sleep.
        """
        print("Loaded Frame", self.frame_no, event.GetURL())
        # NOTE(review): the wx docs say one "loaded" event is generated per
        # frame of a document, so a page with sub-frames may trigger this
        # handler more than once -- confirm whether that matters for the
        # pages being scraped.
        wx.CallLater(self.SETTLE_DELAY_MS, self._capture_and_advance)

    def _capture_and_advance(self):
        """Print the current page source, then load the next URL if any."""
        # A wx window evaluates to False once its C++ part has been
        # destroyed; the dialog may have been closed while the timer ran.
        if not self:
            return
        self.OnGet(None)
        self.frame_no += 1
        if self.frame_no < len(self.url):
            self.browser.LoadURL(self.url[self.frame_no])

    def OnGet(self, event):
        """Print the HTML source of the page currently in the WebView.

        ``event`` is unused; the method is called both as a button handler
        and directly with ``None``.
        """
        print(self.browser.GetPageSource())

    def OnQuit(self, event):
        """Destroy the dialog (bound to the Quit button and to EVT_CLOSE)."""
        self.Destroy()

if __name__ == "__main__":
    # Create the application object, show the browser dialog non-modally,
    # then hand control to the wx event loop.
    app = wx.App()
    browser_dialog = MyBrowser(None, wx.ID_ANY)
    browser_dialog.Show()
    app.MainLoop()

Thanks Rolf, this does get the desired content, but I am trying to do an automation task, so clicking the button won't finish the job. I added, just below the self.quit.Bind(...) line, self.browser.Bind(wx.html2.EVT_WEBVIEW_LOADED, self.OnGet) in order to get the content as soon as the page loads, but what I found out is that the page doesn't seem to fully load until app.MainLoop() starts, and that call blocks all following lines of code until the browser closes. What can I do to avoid this? — Kalio O'Farril, Jun 04 '20 at 20:00
The importance of being specific! You made no mention of `I am trying to do an automation task`. The specific question was `What I want is to used the wx.html2.WebView GetPageSource() method.` (sic) — Rolf of Saxony, Jun 05 '20 at 09:33

GetPageSource for wx.html2 in Python

1 Answer