I need to download all the files from this page that have "Auction of" in their titles. This is the source for one of the files, for example:
<a href="/media/17527/pr090621b.pdf" aria-label="Auction of £2,500 million of 0 5/8% Treasury Gilt 2035, published 09 June 2021">Auction of £2,500 million of 0 5/8% Treasury Gilt 2035</a>
I am trying to adapt some code I found in another question, but the pages are coming back empty:
import os
import re
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
def download_pgn(task):
    session, url, destination_path = task
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    game_url = host + soup.find("a", text="download").get("href")
    filename = re.search(r"\w+\.pgn", game_url).group()
    path = os.path.join(destination_path, filename)
    response = session.get(game_url, stream=True)
    response.raise_for_status()
    with open(path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)

if __name__ == "__main__":
    destination_path = "pgns"
    max_workers = 8
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
    with requests.Session() as session:
        response = session.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        pages = soup.find_all("a", href=re.compile(r".*Auction of\?.*"))
        tasks = [
            (session, host + page.get("href"), destination_path)
            for page in pages
        ]
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pool.map(download_pgn, tasks)
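For reference, this is roughly the direction I think the fix needs to go, as a minimal sketch: the host and listing URL below are placeholders rather than the real page, and it assumes each matched link points straight at a PDF (as the href above suggests), so the intermediate page fetch and the thread pool from the adapted code are left out:

import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

host = "https://example.gov.uk"            # placeholder, not the real host
listing_url = host + "/listing-page"       # placeholder, not the real listing page
destination_path = "pdfs"

os.makedirs(destination_path, exist_ok=True)

with requests.Session() as session:
    response = session.get(listing_url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # "Auction of" is in the link text, not the href, so filter on the text
    links = soup.find_all("a", string=re.compile(r"^Auction of"))

    for link in links:
        pdf_url = urljoin(host, link["href"])
        filename = os.path.basename(link["href"])   # e.g. pr090621b.pdf
        with session.get(pdf_url, stream=True) as resp:
            resp.raise_for_status()
            with open(os.path.join(destination_path, filename), "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    f.write(chunk)

If that is the right idea, the ThreadPoolExecutor from the code above could be layered back on top by turning the body of the loop into the worker function.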