
I am trying to download all PDF files from a website, but every PDF that gets created is corrupted...

import requests
from bs4 import BeautifulSoup

url = "https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0

for link in links:
    if '.pdf' in link.get('href', ''):
        i += 1
        print("Downloading file:", i)

        response = requests.get(link.get('href'))

        with open("pdf" + str(i) + ".pdf", 'wb') as pdf:
            pdf.write(response.content)
        print("File", i, "downloaded")
print("All PDF files downloaded")
Does this answer your question? [Download and save PDF file with Python requests module](https://stackoverflow.com/questions/34503412/download-and-save-pdf-file-with-python-requests-module) – HedgeHog Nov 29 '21 at 11:04

1 Answer


Add headers to your requests. Without a browser-like User-Agent, the server answers the PDF requests with an HTML error page, and that HTML is what gets written into your .pdf files, which is why they appear corrupted.

import requests
from bs4 import BeautifulSoup

# A browser-like User-Agent; without it the server rejects the PDF requests
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; PPC Mac OS X 10_8_7 rv:5.0; en-US) AppleWebKit/533.31.5 (KHTML, like Gecko) Version/4.0 Safari/533.31.5',
}

url = "https://www.geeksforgeeks.org/how-to-extract-pdf-tables-in-python/"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
i = 0

for link in links:
    if '.pdf' in link.get('href', ''):
        i += 1
        print("Downloading file:", i)

        # Send the browser-like headers with the PDF request
        response = requests.get(link.get('href'), headers=headers)

        with open("pdf" + str(i) + ".pdf", 'wb') as pdf:
            pdf.write(response.content)
        print("File", i, "downloaded")
print("All PDF files downloaded")
– EL-AJI Oussama
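Whichever headers you send, it is also worth verifying that the response actually is a PDF before writing it to disk; otherwise an HTML error page can silently end up in a .pdf file again. Below is a minimal sketch of such a safeguard, assuming the same headers dict as above; the helper name download_pdf and its structure are illustrative, not part of the answer:

import requests
from urllib.parse import urljoin

def download_pdf(page_url, href, filename, headers):
    # Resolve relative hrefs (e.g. "/files/a.pdf") against the page URL
    pdf_url = urljoin(page_url, href)
    response = requests.get(pdf_url, headers=headers)
    # Fail loudly on 403/404 instead of saving the error page as a "PDF"
    response.raise_for_status()
    # Skip anything the server did not label as a PDF
    content_type = response.headers.get('Content-Type', '')
    if 'pdf' not in content_type.lower():
        print("Skipping", pdf_url, "- server sent", content_type)
        return False
    with open(filename, 'wb') as f:
        f.write(response.content)
    return True

The Content-Type check is also the quickest way to diagnose the original problem: a "corrupted" PDF downloaded without headers typically comes back as text/html.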