I have tried all the solutions from this very similar post but unfortunately, while I do not get any helpful error and neither do I get any pdf files in my folder.
To change the configuration so that selenium works headless and downloads to a directory I want, I followed this post and this.
However I don't see anything. Also the behaviors are different when executing interactively vs when running a script. When executing interactively I don't see any error but then nothing happens as well. When running a script I get a not so useful error:
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"a[href*={css_selector}']"))).click()
File "C----\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
The website in question is here.
The code that I am trying to make working is -
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.headless = True
uri = "http://affidavitarchive.nic.in/CANDIDATEAFFIDAVIT.aspx?YEARID=March-2017+(+GEN+)&AC_No=1&st_code=S24&constType=AC"
driver = webdriver.Firefox(options=options, executable_path=r'C:\\Users\\xxx\\geckodriver.exe')
profile = webdriver.FirefoxProfile()
profile.set_preference('browser.download.folderList', 2) # custom location
profile.set_preference('browser.download.manager.showWhenStarting', False)
profile.set_preference('browser.download.dir', r'C:\\Users\\xxx\\Downloads')
profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf')
# Function that reads the table in the webpage and extracts the links for the pdfs
def get_links_from_table(uri):
html = requests.get(uri)
soup = BeautifulSoup(html.content, 'lxml')
table = soup.find_all('table')[-1]
candidate_affidavit_links = []
for link in table.find_all('a'):
candidate_affidavit_links.append(link.get('href'))
return candidate_affidavit_links
candidate_affidavit_links_list = get_links_from_table(uri)
driver.get(uri)
# iterate over the javascript links and try to download the pdf files
for js_link in candidate_affidavit_links_list:
css_selector = js_link.split("'")[1]
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"a[href*={css_selector}']"))).click()
driver.execute_script(js_link)