Is this error happening because the page is timing out? Is it related to implicitly_wait
. Any know what's happening as I don't understand
Jul 11 09:34:35 PM 2023-07-12 01:34:35,727 loglevel=ERROR logger=services.extract_link get_link_soup() L158 Failed to save screenshot (png_full) from URL
Jul 11 09:34:35 PM Error message: Message: timeout: Timed out receiving message from renderer: 10.000
Jul 11 09:34:35 PM (Session info: headless chrome=114.0.5735.198)
Jul 11 09:34:35 PM Stacktrace:
Jul 11 09:34:35 PM #0 0x561cabb024e3 <unknown>
Jul 11 09:34:35 PM #1 0x561cab831c76 <unknown>
Jul 11 09:34:35 PM #2 0x561cab81b284 <unknown>
Jul 11 09:34:35 PM #3 0x561cab81afa0 <unknown>
Jul 11 09:34:35 PM #4 0x561cab8199bf <unknown>
Jul 11 09:34:35 PM #5 0x561cab81a162 <unknown>
Jul 11 09:34:35 PM #6 0x561cab83bf9c <unknown>
Jul 11 09:34:35 PM #7 0x561cab8b1bc2 <unknown>
Jul 11 09:34:35 PM #8 0x561cab88d012 <unknown>
Jul 11 09:34:35 PM #9 0x561cab8a530e <unknown>
Jul 11 09:34:35 PM #10 0x561cab88cde3 <unknown>
Jul 11 09:34:35 PM #11 0x561cab8622dd <unknown>
Jul 11 09:34:35 PM #12 0x561cab86334e <unknown>
Jul 11 09:34:35 PM #13 0x561cabac23e4 <unknown>
Jul 11 09:34:35 PM #14 0x561cabac63d7 <unknown>
Jul 11 09:34:35 PM #15 0x561cabad0b20 <unknown>
Jul 11 09:34:35 PM #16 0x561cabac7023 <unknown>
Jul 11 09:34:35 PM #17 0x561caba951aa <unknown>
Jul 11 09:34:35 PM #18 0x561cabaeb6b8 <unknown>
Jul 11 09:34:35 PM #19 0x561cabaeb847 <unknown>
Jul 11 09:34:35 PM #20 0x561cabafb243 <unknown>
Jul 11 09:34:35 PM #21 0x7fc4dd240fa3 start_thread
My code:
def get_link_soup(self, url: str) -> Optional[BeautifulSoup]:
"""
Download the content of a URL and return a BeautifulSoup object for parsing it.
:param url: The URL to download the content of.
:return: A BeautifulSoup object and a screenshot path if the download is successful and the status code is 200, None otherwise.
:raises ValueError: If the URL is invalid (i.e. doesn't have a scheme or network location).
"""
if not self.__is_valid_url(url):
raise ValueError("Invalid URL: protocol must be http or https")
driver = None
soup = None
try:
# Set up the Selenium WebDriver
options = webdriver.ChromeOptions()
options.headless = True
options.add_argument("--headless")
options.add_argument('--no-sandbox')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-extensions')
options.add_argument('--disable-infobars')
options.add_argument('--user-agent={}'.format(helpers.get_user_agents()))
driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(90)
# Load the URL and get the page source
driver.implicitly_wait(10)
driver.get(url)
html = driver.page_source
# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html, 'html.parser')
except (selenium.common.exceptions.WebDriverException, Exception) as e:
traceback.print_exc()
logger.error(f"Error downloading content from URL: {url}\nError message: {str(e)}")
return None
# Set filename and created tmp dir if needed
url_hash = UrlBackup.url_to_hash(url)
if not os.path.exists("tmp"):
os.mkdir("tmp")
if not os.path.exists(f"tmp/{url_hash}"):
os.mkdir(f"tmp/{url_hash}")
png_full = f"tmp/{url_hash}/full.png"
png_fold = f"tmp/{url_hash}/fold.png"
# Take a screenshot of the `full` web page
try:
set_width = 1440
set_height = driver.execute_script(
"return Math.max(document.body.scrollHeight, document.documentElement.scrollHeight)"
)
driver.set_window_size(set_width, set_height)
driver.save_screenshot(png_full)
except Exception as e:
logger.error(f"Failed to save screenshot (png_full) from URL: {url}\nError message: {str(e)}")
# Take a screenshot above the `fold`
try:
new_width = 1440
new_height = 900
driver.set_window_size(new_width, new_height)
driver.save_screenshot(png_fold)
except Exception as e:
logger.error(f"Failed to save screenshot (png_fold) from URL: {url}\nError message: {str(e)}")
finally:
if driver is not None:
# Quit the driver to free up resources
driver.quit()
# Return the soup object and the screenshot path
return soup