
I am working on an automation project, using Selenium and the Requests library, that scrapes a digital library for journal articles in PDF form. The script downloads the PDF files successfully, but when I try to open them I get the error message shown in the screenshot below. This might have something to do with authorisation or cookies, since the script works fine when I am connected to my university's WiFi (the digital resource is provided by my university and I need to log in with my university credentials). A friend also tried it on their machine and it worked fine (my chromedriver is up to date).
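A download that Acrobat rejects like this is often not a PDF at all but an HTML login or error page saved with a .pdf extension. As a quick diagnostic (my own suggestion, not part of the original script; the function name is mine), the first bytes of a broken file can be checked against the PDF magic number:

```python
# Diagnostic sketch: a real PDF begins with the bytes "%PDF-".  If the
# saved file begins with "<html" or "<!DOCTYPE" instead, the server
# returned a login/error page rather than the document.
def looks_like_pdf(data: bytes) -> bool:
    """Return True if the byte string begins with the PDF magic number."""
    return data.lstrip().startswith(b"%PDF-")
```

Running `looks_like_pdf(open(path, "rb").read(16))` on one of the files that Acrobat refuses would show immediately whether the download is really a PDF.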

The code:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
import time
import requests as req


def check_need_to_sign_in():
    new_tab = driver.window_handles
    driver.switch_to.window(new_tab[-1])  # window handles are already strings
    url = driver.current_url
    i = 0
    try:
        WebDriverWait(driver, 5).until(EC.element_to_be_clickable(
            (By.XPATH, "//input[@class='form-control ltr_override input ext-input text-box ext-text-box']")))
        print("Sign-in necessary")
        sign_in_url = driver.current_url
        print(sign_in_url)
        for cookie in driver.get_cookies():
            print(cookie)
        cookie_list = list(map(lambda h: h.get('name')+'='+h.get('value')+'; ', driver.get_cookies()))
        cookie_string = ''.join(cookie_list)
        print(cookie_string)
        headers = {}
        headers["Cookie"] = cookie_string
        s = req.Session()
        s.headers.update(headers)
        sign_in()
        response = s.get(url, verify=False)
        while i < 1:
            print(response.ok)
            if response.ok:
                with open(f"{article_title[13:]}.pdf", 'wb') as f:
                    f.write(response.content)
                i += 1
    except TimeoutException:
        print("No need to sign-in")
        url = driver.current_url
        response = req.get(url, verify=False)
        while i < 1:
            print(response.ok)
            if response.ok:
                with open(f"{article_title[13:]}.pdf", 'wb') as f:
                    f.write(response.content)
                i += 1

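As an aside (a sketch of my own, not from the original post; the function name is mine): instead of concatenating the cookies into a single `Cookie` header string the way `check_need_to_sign_in()` does, the dictionaries returned by Selenium's `get_cookies()` can be loaded into the requests session individually, which lets requests handle domain and path matching itself:

```python
import requests

def session_from_selenium_cookies(cookie_dicts):
    """Build a requests.Session carrying the cookies Selenium captured.

    cookie_dicts is the list of dicts returned by driver.get_cookies(),
    each with at least 'name' and 'value' keys (and usually
    'domain'/'path' as well).
    """
    s = requests.Session()
    for c in cookie_dicts:
        s.cookies.set(
            c["name"],
            c["value"],
            domain=c.get("domain", ""),
            path=c.get("path", "/"),
        )
    return s
```

One thing worth noting in the original function: the cookies are captured before `sign_in()` runs, so whichever approach is used, the session may be carrying pre-authentication cookies when the GET for the PDF is sent.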

def sign_in():
    new_tab = driver.window_handles
    driver.switch_to.window(new_tab[-1])  # window handles are already strings
    email_fill = driver.find_element(By.XPATH, "//input[@type='email']")
    email_fill.send_keys("my email")
    email_fill.send_keys(Keys.RETURN)
    password_fill = driver.find_element(By.XPATH, "//input[@type='password']")
    password_fill.send_keys("my password")
    time.sleep(6)  # necessary sleep
    password_fill = driver.find_element(By.XPATH, "//input[@type='password']")  # re-find the password field
    password_fill.send_keys(Keys.RETURN)
    time.sleep(3)
    stay_signed_in = driver.find_element(By.XPATH, "//input[@type='submit']")  # re-find after the page settles, to avoid a stale element error
    ActionChains(driver).move_to_element(stay_signed_in).click(stay_signed_in).perform()



PATH = "/Applications/chromedriver"
ser = Service(PATH)
chromeOptions = webdriver.ChromeOptions()
prefs = {"plugins.always_open_pdf_externally": True}
chromeOptions.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(service=ser, options=chromeOptions)
driver.maximize_window()
driver.implicitly_wait(20)


driver.get("https://browzine.com/libraries/1374/subjects")
print("Enter targeted Journal name:")
targeted_journal = input()
wait = WebDriverWait(driver, 10)

try:
    button = wait.until(EC.element_to_be_clickable(
        (By.XPATH, "//input[@class='hero-search ember-text-field ember-view']")))
    ActionChains(driver).move_to_element(button).click(button).perform()
    button.send_keys(targeted_journal)
    button.send_keys(Keys.RETURN)
finally:
    pass

timeout = 3
try:
    click_journal = driver.find_element(By.XPATH, "//li[@class='result journal first-result ']")
    ActionChains(driver).move_to_element(click_journal).click(click_journal).perform()
    journal_title = click_journal.find_element(By.XPATH, ".//div[@class='text']").get_attribute("title")
    print(journal_title)
    parent_tab = driver.current_window_handle
    years_available = driver.find_elements(By.XPATH, "//div[@class='year  tabindex' or @class='year selected tabindex']")
    for year in years_available:
        ActionChains(driver).move_to_element(year).click(year).perform()
        acting_on_year = year.text
        print("acting on the year " + acting_on_year)
        issues_container_block = driver.find_element(By.XPATH, "//div[@class='back-issue-items']")
        issues_available = issues_container_block.find_elements(By.XPATH, "//div[@class='issue active-override ember-view' or @class='issue ember-view']")
        for single_issue in issues_available:
            ActionChains(driver).move_to_element(single_issue).click(single_issue).perform()
            articles_in_issue = driver.find_elements(By.XPATH, "//section[@class='article-list-item-content-block ']")
            for article in articles_in_issue:
                article_title = article.get_attribute("aria-label")
                # find_element raises NoSuchElementException when the icon is
                # missing, so test for its presence with try/except rather than
                # checking the return value
                try:
                    pdf_icon_of_article = article.find_element(By.XPATH, ".//span[@class='icon fal fa-file-pdf']")
                    ActionChains(driver).move_to_element(pdf_icon_of_article).click(pdf_icon_of_article).perform()
                    check_need_to_sign_in()
                    driver.close()
                    driver.switch_to.window(parent_tab)
                except NoSuchElementException:
                    print("No PDF icon")
finally:
    pass

driver.quit()

Adobe Acrobat error message:

[screenshot: Adobe Acrobat error message]

asked by double_wizz

  • assuming that everything else is working OK, you're probably quitting the driver/browser before the download has completed. The switch-to window handle code is a little odd, though... also careful with driver.close()... with some drivers, if that's the last open tab it will also quit the driver (execute the JavaScript "close();" instead). Don't rely on index to switch handles. Get the current handle, then after a new tab is opened iterate through window handles and switch to the one that is not the current. – pcalkins Dec 02 '21 at 20:45
  • @pcalkins just had to close the tab 3 secs later :) thanks! – double_wizz Dec 02 '21 at 23:02
  • @pcalkins the script downloads two files for each download for some reason. One of the two files opens fine but doesn't get renamed as I have specified in the request statement (probably what now gets downloaded after I have implemented the wait) and another that does get named as specified but gives the same message in the screenshot (probably the files that got downloaded before). Do you know how to fix this? – double_wizz Dec 03 '21 at 15:28
  • some browsers will rename the file after it is fully downloaded. For instance in Chrome it'll be .crdownload... then when it's done renamed to the filename. If you can send the request outside of your Selenium code, that'd be best. Then you'd know when it's complete. There are also other techniques that will poll the files to check for the rename. Search around here at Stackoverflow... there's a few different methods that have been posted. – pcalkins Dec 03 '21 at 18:32
  • @pcalkins can you elaborate in [this](https://stackoverflow.com/questions/70274125/selenium-script-downloads-two-files-instead-of-one-using-requests-lib) context? – double_wizz Dec 08 '21 at 12:04
  • I don't know python but it looks like a loop problem. Not sure why you're looping at all for a request. – pcalkins Dec 08 '21 at 18:12
  • Maybe see this thread?: https://stackoverflow.com/questions/16694907/download-large-file-in-python-with-requests – pcalkins Dec 08 '21 at 18:22
  • @pcalkins the loop is to make sure the get request went through for the specified url. Since it might take some time to load. Why do you think it might be a problem? The files that get downloaded aren't the same. – double_wizz Dec 08 '21 at 19:23
  • @pcalkins i have also tried without the loop it still does the same thing. – double_wizz Dec 08 '21 at 19:26
  • there's 2 loops, no? (again, I don't know python, so...) Did you eliminate them both? Seems like the request will return a file download header with the payload... why would you loop that? You should be able to parse it to get response code, header, payload after it's been retrieved. (It might just be different in Python, I don't know... in Java you just use URLToFile...) If you send 2 gets one right after the other, the 1st will be aborted. – pcalkins Dec 08 '21 at 19:30
  • Seems like the first request should finish before the next though.... doesn't Python wait for the response? – pcalkins Dec 08 '21 at 19:53
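pcalkins' advice about window handles (capture the handles before the click, then switch to whichever handle is new, rather than trusting the index) could be sketched like this — the helper name is mine, and the commented usage shows how it would slot into the script above:

```python
def find_new_handle(handles_before, handles_after):
    """Return the one window handle present after a click but not before."""
    new = [h for h in handles_after if h not in handles_before]
    return new[0] if new else None

# Hypothetical usage with the script above:
#   before = set(driver.window_handles)
#   ActionChains(driver).move_to_element(pdf_icon_of_article).click(pdf_icon_of_article).perform()
#   WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > len(before))
#   driver.switch_to.window(find_new_handle(before, driver.window_handles))
#   ... download ...
#   driver.execute_script("close();")  # per pcalkins: safer than driver.close() on the last tab
#   driver.switch_to.window(parent_tab)
```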

0 Answers