1

I'm writing a simple web scraper that downloads the item images for some champions from a site. I put in a "for" loop over 5 characters, but it only executes 2 of them and then closes without giving any error!

import bs4 as bs
import sys,os
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl

class Page(QWebEnginePage):
    """Load a single URL and keep the resulting HTML in ``self.html``.

    Constructing a ``Page`` blocks: ``__init__`` starts the Qt event loop
    and only returns after ``Callable`` has received the HTML and asked
    the application to quit.

    Args:
        url: Address of the page to load (anything ``QUrl`` accepts).

    Attributes:
        app: The process-wide ``QApplication`` driving the event loop.
        html: HTML text of the loaded page; ``''`` until ``Callable`` runs.
    """

    def __init__(self, url):
        # Qt supports only one QApplication object per process, so reuse
        # the existing one when several pages are loaded in a row instead
        # of constructing a fresh application for every page.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ''
        print("#1 __init__")
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks until Callable() calls quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page's HTML once loading has finished."""
        # toHtml() is asynchronous: it returns nothing and hands the HTML
        # to the callback, so its return value must not be stored in
        # self.html.
        self.toHtml(self.Callable)
        print('#2 On Load finished')

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and stop the event loop."""
        print("#3 Callable\n")
        self.html = html_str
        self.app.quit()

def already_exist(image_name):
    """Report whether ``image_name`` still has to be downloaded.

    Despite its name, this returns ``True`` when the file is *not* present
    and ``False`` when it is; the caller treats the result as "should
    download", so that contract is kept.

    Args:
        image_name: Bare file name to look for inside ``Images``.

    Returns:
        bool: ``False`` if ``Images/<image_name>`` is an existing file,
        ``True`` otherwise, including when ``Images`` does not exist.
    """
    # Downloads are written directly into Images/, so only its top level
    # matters. A direct check also yields a real bool when the directory
    # is missing, where walking an absent directory produced None.
    return not os.path.isfile(os.path.join('Images', image_name))

def ImageDownload(url):
    """Download ``url`` into the ``Images`` directory unless it is there.

    The file is saved under the last path segment of ``url``. Failures
    are reported on stdout and never propagate, so one bad image does not
    stop the remaining downloads.

    Args:
        url: Address of the image to fetch.
    """
    file_name = url.split("/")[-1]
    try:
        # Create the target directory first so both the existence check
        # and the download have somewhere to look and write.
        os.makedirs("Images", exist_ok=True)
        # already_exist() returns True when the file is NOT yet on disk.
        if already_exist(file_name):
            full_path = "Images/" + file_name
            urllib.request.urlretrieve(url, full_path)
            # Report the file name, not the whole list of URL segments.
            print("Download %s" % file_name)
        else:
            print("Image already Downloaded >: %s" % file_name)
    except Exception as exc:
        # Best effort: keep going, but say what went wrong.
        print("Error Download: %s" % exc)

def main():
    """Download the item images shown on each champion's details page.

    For every champion the page is loaded through ``Page``, every
    ``<img>`` inside a ``<div class="items">`` is collected, and each
    image is passed to ``ImageDownload``. A failure for one champion is
    printed and the loop moves on to the next one.
    """
    champions = ['Amumu','Akali','Zed','Nunu'] #champions
    for champ in champions:
        try:
            print("\nDownloading Images >: %s"% champ)
            data = Page('https://www.probuilds.net/champions/details/%s' % champ.strip())
            soup = bs.BeautifulSoup(data.html, 'html.parser')
            for photos in soup.find_all('div',{'class':'items'}):
                for image in photos.find_all('img'):
                    # Skip <img> tags without a src rather than letting
                    # one of them abort the rest of this champion.
                    src = image.get('src')
                    if src:
                        ImageDownload(src)
        except Exception as exc:
            # Still best effort per champion, but Ctrl+C is no longer
            # swallowed and the cause is shown.
            print("Shi... %s" % exc)


if __name__ == '__main__':
    main()

I'm getting no error, but the program only runs for 2 of the champions. That is the problem; can someone help me?

1 Answer

0

It seems that the QWebEnginePage does not close correctly. It is also advisable to reuse a single QWebEnginePage instead of creating a new one for each URL, so, using an old answer as a basis, I have implemented the following solution:

import os
import sys
import bs4 as bs
import urllib.request
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets

class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """A single web page that loads a sequence of URLs one after another.

    After each load finishes, the page's HTML is handed to ``process``;
    then the next URL is loaded. Once the sequence is exhausted the Qt
    application is asked to quit.
    """

    def __init__(self):
        super().__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Remember ``urls`` as an iterator and begin with the first one."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next pending URL.

        Returns:
            bool: ``True`` if a load was started, ``False`` if no URLs
            were left.
        """
        exhausted = object()
        target = next(self._urls, exhausted)
        if target is exhausted:
            return False
        self.load(QtCore.QUrl(target))
        return True

    def processCurrentPage(self, html):
        """Receive the HTML from ``toHtml``, process it, then move on."""
        self.process(self.url(), html)
        more_pending = self.fetchNext()
        if more_pending:
            return
        # Nothing left to load: stop the event loop.
        QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Ask for the loaded page's HTML; it arrives via callback."""
        self.toHtml(self.processCurrentPage)

    def process(self, url, html):
        """Hook for subclasses; by default print the HTML length and URL."""
        char_count = len(html)
        address = url.toString()
        print('loaded: [%d chars] %s' % (char_count, address))

class ScrapePage(WebPage):
    """Collect the ``src`` of every item image on the pages it loads.

    Attributes:
        results: Set of image URLs gathered across all processed pages.
    """

    def __init__(self):
        super(ScrapePage, self).__init__()
        self.results = set()

    def process(self, url, html):
        """Add the image sources found inside ``<div class="items">``.

        Args:
            url: URL of the page that was loaded (unused here).
            html: HTML text of the loaded page.
        """
        soup = bs.BeautifulSoup(html, 'html.parser')
        for photos in soup.find_all('div',{'class':'items'}):
            for image in photos.find_all('img'):
                # This runs inside a Qt callback, so a KeyError from an
                # <img> without src must not escape; skip such tags.
                src = image.get('src')
                if src:
                    self.results.add(src)

def already_exist(image_name):
    """Report whether ``image_name`` still has to be downloaded.

    Despite its name, this returns ``True`` when the file is *not* present
    and ``False`` when it is; the caller treats the result as "should
    download", so that contract is kept.

    Args:
        image_name: Bare file name to look for inside ``Images``.

    Returns:
        bool: ``False`` if ``Images/<image_name>`` is an existing file,
        ``True`` otherwise, including when ``Images`` does not exist.
    """
    # Downloads are written directly into Images/, so only its top level
    # matters. A direct check also yields a real bool when the directory
    # is missing, where walking an absent directory produced None.
    return not os.path.isfile(os.path.join('Images', image_name))

def ImageDownload(url):
    """Download ``url`` into the ``Images`` directory unless it is there.

    The file is saved under the last path segment of ``url``. Failures
    are reported on stdout and never propagate, so one bad image does not
    stop the remaining downloads.

    Args:
        url: Address of the image to fetch.
    """
    file_name = url.split("/")[-1]
    try:
        # Create the target directory first so both the existence check
        # and the download have somewhere to look and write.
        os.makedirs("Images", exist_ok=True)
        # already_exist() returns True when the file is NOT yet on disk.
        if already_exist(file_name):
            full_path = "Images/" + file_name
            urllib.request.urlretrieve(url, full_path)
            # Report the file name, not the whole list of URL segments.
            print("Download %s" % file_name)
        else:
            print("Image already Downloaded >: %s" % file_name)
    except Exception as exc:
        # Best effort: keep going, but say what went wrong.
        print("Error Download: %s" % exc)

if __name__ == '__main__':
    # Script entry point: scrape every champion page with one shared
    # ScrapePage, then download the collected images after the Qt event
    # loop has finished.
    app = QtWidgets.QApplication(sys.argv)
    webpage = ScrapePage()

    base_url = 'https://www.probuilds.net/champions/details/'
    champions = ['Amumu','Akali','Zed','Nunu']

    # Resolve each champion name against the base URL.
    root = QtCore.QUrl(base_url)
    urls = [root.resolved(QtCore.QUrl(name)) for name in champions]

    webpage.start(urls)
    app.exec_()  # returns once the page has processed the last URL

    for image_url in webpage.results:
        ImageDownload(image_url)
eyllanesc
  • 235,170
  • 19
  • 170
  • 241
  • @TutiTutors Can you be specific? What part of the code do you not understand? – eyllanesc Nov 10 '18 at 04:12
  • class "WebPage" ! – Tuti Tutors Nov 10 '18 at 04:51
  • @TutiTutors you have to be more specific; that class is practically the complete solution. In general, the idea is to create an iterator over the links, so that once the .html has been obtained, fetchNext() is called to take the next URL from the iterator and load that page. If you have a specific question, it would be great. – eyllanesc Nov 10 '18 at 04:55
  • I made an edit in the post and it is now working, can you take a look at the code and say your opinion? – Tuti Tutors Nov 11 '18 at 19:12
  • @TutiTutors please do not add code modifications to my answer; such edits are not useful. Edits serve to improve the grammar or the syntax, to update broken links, etc., but not to add code. My code works and it is my solution; you could indicate in your question the modification you made to my answer to get your solution. – eyllanesc Nov 11 '18 at 19:16