I currently have a Selenium function; below is a summary of what the code does:

def scraping_bot(selector_list):
    FOR LOOP over selector_list:  # Page A (initial), contains 12 items
        requests + bs4 grab the element coordinates
        an f-string transforms them into a CSS selector
        the selenium driver opens, detects and clicks that element
        FOR LOOP over [f-string selectors]:  # Page B, contains 1 item
            driver.current_url is used to prepare the new elements to detect
            requests + bs4 grab the element coordinates
            an f-string transforms them into a CSS selector
            the selenium driver detects and clicks that element
            the download begins
            sleep for 0.5 sec
            the driver goes back to the previous page
Now, my problem is that it crashes at a predictable iteration: when the loop reaches Page B for element 6/12 in the list, it fails with the following error:
'//OBJECT//' is not clickable at point (591, 797). Other element would receive the click: <div style="position: relative" class="cookie-consent-inner">...</div>
(Session info: MicrosoftEdge=...)
Stacktrace:
Backtrace:
...
Now, I don't have any problem with it failing there, but I wish it would continue on to Page B 7/12 and so on, since the function does call driver.back(). Instead, the whole application stops.
I tried encasing the entire call in a try/except with pass to capture this error. However, the loop then starts again from Page A and still misses the rest.
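Roughly, that attempt looked like this (paraphrased from memory; the exception class name is taken from the traceback above):

from selenium.common.exceptions import ElementClickInterceptedException

for i in year_css_selector:
    try:
        scraping_bot(i)  # the whole Page A -> Page B run for one year
    except ElementClickInterceptedException:
        pass  # swallows the crash, but abandons the rest of that year's months

Because the except wraps the entire scraping_bot(i) call, one intercepted click discards everything left in that year, and the next iteration starts over from Page A.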
I would like a method where I could somehow place a continue statement in the right spot, but I've only just started learning and I ran out of ideas. You can see in the raw code that I tried an IF: ERROR check at the end of the loop in the hope of putting a pass there, but that seems to be a syntax error (see the sketch after the code for what I have in mind). See the raw code below:
import concurrent.futures
import os
import time
import requests
import re
import selenium.common.exceptions
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
import multiprocessing
edge_driver = 'C:\\selenium\\webdriver\\edge'
os.environ['PATH'] += os.pathsep + edge_driver  # make msedgedriver discoverable on PATH
web_links = {'digital archive': 'https://digital.nmla.metoffice.gov.uk/SO_1118bfbb-f2c9-476f-aa07-eb58b6db5ce6/', }
def scraping_bot(css_selector):
# First stage: Years
print('FIRST STAGE INITIATED....')
driver = webdriver.Edge()
driver.get(web_links.get('digital archive'))
year_args = (By.CSS_SELECTOR, f'a[href="{css_selector}"]')
driver.find_element(*year_args).click()
# Second Stage: Months
print('SECOND STAGE INITIATED....')
sTWO_url = driver.current_url
sTWO_site = requests.get(sTWO_url)
sTWO_web_objects = BeautifulSoup(sTWO_site.text, 'lxml')
monthly_placeholders = sTWO_web_objects.find(name='div', attrs={'class': 'twelve columns last results'})
months = monthly_placeholders.find_all(name='h5')
month_css_selector = {}
for month_href_tags in months:
        month_tag = month_href_tags.get_text()  # month name, used as the dict key
        month_hrefs = re.findall(regex, str(month_href_tags))  # `regex` is the module-level pattern defined under __main__
for month_href in month_hrefs:
month_css_selector.update({month_tag: month_href})
    for y, v in month_css_selector.items():  # y = month name, v = month href
print(v) ##############################
month_args = (By.CSS_SELECTOR, f'a[href="{v}/"]')
driver.find_element(*month_args).click()
# Third Stage: Download
print(f'THIRD STAGE INITIATED for: {y}: {v}')
sTWO_url = driver.current_url
download_site = requests.get(sTWO_url)
content = BeautifulSoup(download_site.text, 'lxml')
nav_controls = content.find_all('nav')
        download_button = [nav.find(attrs={'title': 'download'}) for nav in nav_controls]  # one download link (or None) per <nav> block
download_regex = r'(?<=href=\").{1,}(?=\" title)'
for button in download_button:
if button is not None:
print(button) ##############################
downl = re.findall(download_regex, str(button))
if len(downl) == 1:
for downl_button in downl:
download_args = (By.CSS_SELECTOR, f'a[href="{downl_button}"]')
driver.find_element(*download_args).click()
time.sleep(2)
print(f'THIRD STAGE DOWNLOAD COMPLETE: {y}; {v}')
##### END OF TREE HERE ####
driver.back() # goes back to Second Stage and so on
else:
                    print(f'Your download button matches exceed 1: {len(downl)}')
        if selenium.common.exceptions.ElementClickInterceptedException:
            continue  # my failed attempt: the exception class is always truthy here, so this never actually catches the error
if __name__ == '__main__':
sONE_url = requests.get(web_links.get('digital archive'))
sONE_web_objects = BeautifulSoup(sONE_url.text, 'lxml')
year_placeholder = sONE_web_objects.find(name='div', attrs={'class': 'sixteen columns results-and-filters'})
years = year_placeholder.find_all(name='div', attrs={'class': ['one_sixth grey_block new-secondary-background result-item',
'one_sixth grey_block new-secondary-background result-item last']}) # don't skip, needed for titles.
    unit = [year.find('h5') for year in years]  # the <h5> holding each year's link
regex = r'(?<=href=\").{1,}(?=\/")' # lookaround = PositiveLookBehind...PositiveLookAhead
year_css_selector = []
    titles = [year.get('title') for year in years]
for year_href_tags, year_tag in zip(unit, titles): # href_tag -> bs4 component
        hrefs = re.findall(regex, str(year_href_tags))  # str(tag) yields the tag's HTML for the lookaround regex to scan
for year_href in hrefs:
year_css_selector.append(f'{year_href}/')
for i in year_css_selector:
scraping_bot(i)
Thus, my expected behaviour is simply a pass or continue that skips the erroneous web page, which I can then download manually myself.
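Is something like this the right idea? Just a rough, untested sketch (reusing driver, By and month_css_selector from the raw code above), moving the try/except inside the month loop so that only the offending click is skipped:

from selenium.common.exceptions import ElementClickInterceptedException

for y, v in month_css_selector.items():  # y = month name, v = month href
    month_args = (By.CSS_SELECTOR, f'a[href="{v}/"]')
    try:
        driver.find_element(*month_args).click()
    except ElementClickInterceptedException:
        print(f'SKIPPED (click intercepted): {y}: {v}')
        continue  # move on to the next month instead of crashing
    # ... third stage download exactly as before ...
    driver.back()  # back to Second Stage for the next month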