1

Is this error happening because the page is timing out? Is it related to `implicitly_wait`? Does anyone know what's happening? I don't understand it.

Jul 11 09:34:35 PM  2023-07-12 01:34:35,727 loglevel=ERROR  logger=services.extract_link get_link_soup() L158  Failed to save screenshot (png_full) from URL
Jul 11 09:34:35 PM  Error message: Message: timeout: Timed out receiving message from renderer: 10.000
Jul 11 09:34:35 PM    (Session info: headless chrome=114.0.5735.198)
Jul 11 09:34:35 PM  Stacktrace:
Jul 11 09:34:35 PM  #0 0x561cabb024e3 <unknown>
Jul 11 09:34:35 PM  #1 0x561cab831c76 <unknown>
Jul 11 09:34:35 PM  #2 0x561cab81b284 <unknown>
Jul 11 09:34:35 PM  #3 0x561cab81afa0 <unknown>
Jul 11 09:34:35 PM  #4 0x561cab8199bf <unknown>
Jul 11 09:34:35 PM  #5 0x561cab81a162 <unknown>
Jul 11 09:34:35 PM  #6 0x561cab83bf9c <unknown>
Jul 11 09:34:35 PM  #7 0x561cab8b1bc2 <unknown>
Jul 11 09:34:35 PM  #8 0x561cab88d012 <unknown>
Jul 11 09:34:35 PM  #9 0x561cab8a530e <unknown>
Jul 11 09:34:35 PM  #10 0x561cab88cde3 <unknown>
Jul 11 09:34:35 PM  #11 0x561cab8622dd <unknown>
Jul 11 09:34:35 PM  #12 0x561cab86334e <unknown>
Jul 11 09:34:35 PM  #13 0x561cabac23e4 <unknown>
Jul 11 09:34:35 PM  #14 0x561cabac63d7 <unknown>
Jul 11 09:34:35 PM  #15 0x561cabad0b20 <unknown>
Jul 11 09:34:35 PM  #16 0x561cabac7023 <unknown>
Jul 11 09:34:35 PM  #17 0x561caba951aa <unknown>
Jul 11 09:34:35 PM  #18 0x561cabaeb6b8 <unknown>
Jul 11 09:34:35 PM  #19 0x561cabaeb847 <unknown>
Jul 11 09:34:35 PM  #20 0x561cabafb243 <unknown>
Jul 11 09:34:35 PM  #21 0x7fc4dd240fa3 start_thread

My code:

    def get_link_soup(self, url: str) -> Optional[BeautifulSoup]:
        """
        Load a URL in headless Chrome, parse the rendered HTML and save two screenshots.

        The page is fetched with Selenium and its page source is parsed with
        BeautifulSoup. Two screenshots are then written on a best-effort basis
        to ``tmp/<url_hash>/full.png`` (window resized to the full document
        height) and ``tmp/<url_hash>/fold.png`` (1440x900 window). A screenshot
        failure is logged and does not affect the return value.

        The WebDriver is always quit before this method returns or raises,
        including when the page load fails.

        :param url: The URL to download the content of.
        :return: A BeautifulSoup object for the page, or None if the browser
            could not be started or the page could not be loaded/parsed.
        :raises ValueError: If ``self.__is_valid_url(url)`` rejects the URL.
        """
        if not self.__is_valid_url(url):
            raise ValueError("Invalid URL: protocol must be http or https")

        driver = None
        try:
            try:
                # Set up the Selenium WebDriver. Headless mode is requested via
                # the command-line switch only; the `options.headless`
                # attribute is deprecated and merely duplicated this argument.
                options = webdriver.ChromeOptions()
                options.add_argument("--headless")
                options.add_argument('--no-sandbox')
                options.add_argument('--ignore-certificate-errors')
                options.add_argument('--disable-dev-shm-usage')
                options.add_argument('--disable-extensions')
                options.add_argument('--disable-infobars')
                options.add_argument('--user-agent={}'.format(helpers.get_user_agents()))

                driver = webdriver.Chrome(options=options)
                driver.set_page_load_timeout(90)

                # Load the URL and get the page source
                driver.implicitly_wait(10)
                driver.get(url)
                html = driver.page_source

                # Create a BeautifulSoup object from the HTML
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e:
                # WebDriverException is a subclass of Exception, so a single
                # broad handler covers both Selenium and parsing failures.
                traceback.print_exc()
                logger.error(f"Error downloading content from URL: {url}\nError message: {str(e)}")
                return None

            # Set filename and create the tmp dir if needed. makedirs with
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            url_hash = UrlBackup.url_to_hash(url)
            os.makedirs(f"tmp/{url_hash}", exist_ok=True)
            png_full = f"tmp/{url_hash}/full.png"
            png_fold = f"tmp/{url_hash}/fold.png"

            # Take a screenshot of the `full` web page by growing the window
            # to the document's scroll height.
            # NOTE(review): the "Timed out receiving message from renderer"
            # error was logged from this step; presumably very tall pages make
            # the resize/capture too slow -- consider capping the height.
            try:
                set_width = 1440
                set_height = driver.execute_script(
                    "return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)"
                )
                driver.set_window_size(set_width, set_height)
                driver.save_screenshot(png_full)
            except Exception as e:
                logger.error(f"Failed to save screenshot (png_full) from URL: {url}\nError message: {str(e)}")

            # Take a screenshot above the `fold`
            try:
                new_width = 1440
                new_height = 900
                driver.set_window_size(new_width, new_height)
                driver.save_screenshot(png_fold)
            except Exception as e:
                logger.error(f"Failed to save screenshot (png_fold) from URL: {url}\nError message: {str(e)}")

            return soup
        finally:
            # Quit the driver on every exit path (success, early return or
            # exception) so no headless Chrome process is left running.
            if driver is not None:
                driver.quit()
Mark
  • 105
  • 1
  • 5

0 Answers0