1

I'm trying to parse a dynamic image's JavaScript-generated URL out of each page in a list of URLs. But when I use a for loop, it stops at the 3rd iteration or earlier and returns the results collected so far.

I'm not sure where it goes wrong. Can anybody here point out the mistake?

(Note: The Qt5 code was not written by me. It's from someone on this forum and I'm testing it out. I want to thank the person but can't find the thread.)

import os
import requests
import bs4 as bs
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
from PyQt5.QtWebEngineWidgets import QWebEnginePage
import sys
import urllib.request

# Page URLs p4 through p32 of the same chapter, in ascending page order.
get_all_pages_links_1 = [
    'http://www.dm5.com/m1103555-p{}'.format(page_number)
    for page_number in range(4, 33)
]



##- Method using a Qt5 class acting as a browser to render JavaScript links

class Client(QWebEnginePage):
    """Load a URL in a Qt web engine page and capture the resulting HTML.

    Constructing a ``Client`` blocks until the page has finished loading and
    its HTML has been delivered; the markup is then available as ``html``.

    Attributes:
        app: The process-wide ``QApplication`` whose event loop drives the load.
        html: The page HTML as a string ("" until the load has completed).
    """

    def __init__(self, url):
        """Start loading ``url`` and run the event loop until the HTML arrives.

        Args:
            url: Address of the page to load, as a string.
        """
        # Qt supports only one QApplication per process. This class is meant
        # to be instantiated once per URL, so reuse the existing application
        # object instead of constructing a new one on every call.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ""
        self.loadFinished.connect(self.on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls quit().
        self.app.exec_()

    def on_load_finished(self):
        """Request the page HTML once the load has finished."""
        # toHtml() is asynchronous: it returns nothing and hands the markup to
        # the callback, so its return value must not be stored in self.html.
        self.toHtml(self.Callable)
        print("Load Finished")

    def Callable(self, data):
        """Store the delivered HTML and stop the event loop.

        Args:
            data: The page HTML passed in by ``toHtml``.
        """
        self.html = data
        self.app.quit()


# Image source links gathered from every page, in the order the pages are visited.
dwn_link = []
for page_url in get_all_pages_links_1:
    print(page_url)

    # Load the page through the Qt5 Client (used here to obtain the
    # JavaScript-generated links), then parse the HTML it captured.
    client_response = Client(page_url)
    soup_javascript = bs.BeautifulSoup(client_response.html, 'html.parser')

    # Take the src of every element matched by the '#cp_image' CSS selector.
    dwn_link.extend(
        image_tag['src'] for image_tag in soup_javascript.select('#cp_image')
    )

    # Show the running list after each page.
    print(dwn_link)
John Kugelman
  • 349,597
  • 67
  • 533
  • 578

0 Answers0