I am working on an automation project, using Selenium and Requests library, that's scraping a digital library for journals in the form of PDFs. The script successfully downloads the PDF documents but when I try to open them I get the error message shown in the screenshot shown bellow. This might have something to do with authorisations or cookies as the work script works fine when I am connected to my university's Wifi (the digital resource is provided by my uni and I need to log in using my uni credentials). Also, a friend tried it on their machine and it worked fine (my chromedriver is up to date).
The code:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import time
import requests as req
def check_need_to_sign_in():
new_tab = driver.window_handles
driver.switch_to.window(str(new_tab[-1]))
url = driver.current_url
i = 0
try:
WebDriverWait(driver, 5).until(EC.element_to_be_clickable(
(By.XPATH, "//input[@class='form-control ltr_override input ext-input text-box ext-text-box']")))
print("Sign-in necessary")
sign_in_url = driver.current_url
print(sign_in_url)
for cookie in driver.get_cookies():
print(cookie)
cookie_list = list(map(lambda h: h.get('name')+'='+h.get('value')+'; ', driver.get_cookies()))
cookie_string = ''.join(cookie_list)
print(cookie_string)
headers = {}
headers["Cookie"] = cookie_string
s = req.session()
s.headers.update(headers)
sign_in()
response = s.get(url, verify=False)
while i < 1:
print(response.ok)
if response.ok == True:
with open(f"{article_title[13:]}.pdf", 'wb') as f:
f.write(response.content)
i = + 1
except TimeoutException:
print("No need to sign-in")
url = driver.current_url
response = req.get(url, verify=False)
while i < 1:
print(response.ok)
if response.ok == True:
with open(f"{article_title[13:]}.pdf", 'wb') as f:
f.write(response.content)
i = + 1
def sign_in():
new_tab = driver.window_handles
driver.switch_to.window(str(new_tab[-1]))
email_fill = driver.find_element(By.XPATH, "//input[@type='email']")
email_fill.send_keys("my email")
email_fill.send_keys(Keys.RETURN)
password_fill = driver.find_element(By.XPATH, "//input[@type='password']")
password_fill.send_keys("my password")
time.sleep(6) # necessary sleep
stay_signed_in = driver.find_element(By.XPATH, "//input[@type='password']")
stay_signed_in.send_keys(Keys.RETURN)
stay_signed_in = driver.find_element(By.XPATH, "//input[@type='submit']") # avoid stale element error
time.sleep(3)
stay_signed_in = driver.find_element(By.XPATH, "//input[@type='submit']") # avoid stale element error
ActionChains(driver).move_to_element(stay_signed_in).click(stay_signed_in).perform()
PATH = "/Applications/chromedriver"
ser = Service(PATH)
chromeOptions = webdriver.ChromeOptions()
prefs = {"plugins.always_open_pdf_externally": True}
chromeOptions.add_experimental_option("prefs",prefs)
driver = webdriver.Chrome(service=ser,options=chromeOptions)
driver.maximize_window()
driver.implicitly_wait(20)
driver.get("https://browzine.com/libraries/1374/subjects")
print("Enter targeted Journal name:")
targeted_journal = input()
wait = WebDriverWait(driver, 10)
try:
button = wait.until(EC.element_to_be_clickable(
(By.XPATH, "//input[@class='hero-search ember-text-field ember-view']")))
ActionChains(driver).move_to_element(button).click(button).perform()
button.send_keys(targeted_journal)
button.send_keys(Keys.RETURN)
finally:
pass
timeout = 3
try:
click_journal = driver.find_element(By.XPATH, "//li[@class='result journal first-result ']")
ActionChains(driver).move_to_element(click_journal).click(click_journal).perform()
journal_title = click_journal.find_element(By.XPATH, ".//div[@class='text']").get_attribute("title")
print(journal_title)
parent_tab = driver.current_window_handle
years_available = driver.find_elements(By.XPATH, "//div[@class='year tabindex' or @class='year selected tabindex']")
for year in years_available:
ActionChains(driver).move_to_element(year).click(year).perform()
acting_on_year = year.text
print("acting on the year " + acting_on_year)
issues_container_block = driver.find_element(By.XPATH, "//div[@class='back-issue-items']")
issues_available = issues_container_block.find_elements(By.XPATH, "//div[@class='issue active-override ember-view' or @class='issue ember-view']")
for single_issue in issues_available:
ActionChains(driver).move_to_element(single_issue).click(single_issue).perform()
articles_in_issue = driver.find_elements(By.XPATH, "//section[@class='article-list-item-content-block ']")
for article in articles_in_issue:
article_title = article.get_attribute("aria-label")
check_pdf_button = article.find_element(By.XPATH, ".//span[@class='icon fal fa-file-pdf']")
if len(str(check_pdf_button))>0:
pdf_icon_of_article = article.find_element(By.XPATH,".//span[@class='icon fal fa-file-pdf']")
ActionChains(driver).move_to_element(pdf_icon_of_article).click(pdf_icon_of_article).perform()
check_need_to_sign_in()
driver.close()
driver.switch_to.window(parent_tab)
elif NoSuchElementException:
print("No PDF icon")
pass
continue
continue
finally:
pass
driver.quit()
Adobe Acrobat error message: