
I'm a python beginner and I'm hoping that what I'm trying to do isn't too involved. Essentially, I want to extract the text of the minutes (contained in PDF documents) from this municipality's council meetings for the last ~10 years at this website: https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3

Eventually, I want to analyze/categorise the action items from the meeting minutes. All I've been able to do so far is grab the links leading to the PDFs from the first page. Here is my code:

# Import requests for navigating to websites, beautiful soup to scrape website, PyPDF2 for PDF data mining
 
import sys 
import requests
import bs4 
import PyPDF2 
#import PDfMiner 
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup 

# Soupify URL
my_url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"
result = requests.get(my_url)
src = result.content
page_soup = soup(src, "lxml")

#list with links
urls = []
for tr_tag in page_soup.find_all("tr"):
    a_tag = tr_tag.find("a")
    urls.append(a_tag.attrs["href"])

print(urls)

A few things I could use help with:

  • How do I pull the links from pages 1 - 50 (an arbitrary range) of the 'Previous Meetings' site, instead of just the first page?
  • How do I go about entering each of the links and pulling the 'Read the minutes' PDFs for text analysis (using PyPDF2)?

Any help is so appreciated! Thank you in advance!

EDIT: I am hoping to get the data into a dataframe, where the first column is the file name and the second column is the text from the PDF. It would look like:

PDF_file_name PDF_text
spec20210729min [[' \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING MINUTES\n \n \nJULY 29, 2021\n \n \nA Special Meeting of the Council\n \nof the City of Vancouver\n \nw
spec20210802min [[' \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING MINUTES\n \n \nAUGUST 2, 2021\n \n \nA Special Meeting of the Council\n \nof the City of Vancouver\n \nw
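In other words, something along these lines (a minimal sketch assuming pandas; the text values are shortened here):

import pandas as pd

# One row per PDF: the file name plus the extracted text
rows = [
    {"PDF_file_name": "spec20210729min", "PDF_text": "SPECIAL COUNCIL MEETING MINUTES JULY 29, 2021 ..."},
    {"PDF_file_name": "spec20210802min", "PDF_text": "SPECIAL COUNCIL MEETING MINUTES AUGUST 2, 2021 ..."},
]
df = pd.DataFrame(rows)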
scotiaboy

2 Answers


Welcome to the exciting world of web scraping!

First of all, great job, you were on the right track. There are a few points to discuss though.

You essentially have 2 problems here.

1 - How to retrieve the HTML text for all pages (1, ..., 50)?

In web scraping, you mainly encounter two kinds of web pages:

  1. If you are lucky, the page does not render using JavaScript and you can use requests alone to get the page content
  2. If you are less lucky, the page uses JavaScript to render itself, partly or entirely

To get all the pages from 1 to 50, we need to somehow click on the next button at the end of the page. Why? If you check what happens in the network tab of the browser developer console, you see that for each click on the next button, a new request is made that fetches a JS script to generate the page. Unfortunately, we can't render JavaScript using requests.

But we have a solution: Headless Browsers (wiki).

In the solution, I use selenium, which is a library that can use a real browser driver (in our case Chrome) to query a page and render JavaScript.

So we first get the web page with selenium, we extract the HTML, we click on next and wait a bit for the page to load, we extract the HTML, ... and so on.
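As a side note, instead of sleeping a fixed amount of time after each click, you could use an explicit wait. Here is a small sketch of that idea; it assumes the AJAX update replaces the table body (so the old reference goes stale), and it reuses the same "next" button selector as the full solution below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def click_next_and_wait(driver, timeout: int = 10):
    """Click the 'next' button and wait until the results table has been refreshed."""
    # Keep a reference to the current table body so we can detect when it is swapped out
    old_tbody = driver.find_element(By.TAG_NAME, "tbody")
    driver.find_element(
        By.CSS_SELECTOR,
        "#LiverpoolTheme_wt93_block_wtMainContent_RichWidgets_wt132_block_wt28",
    ).click()
    # Once the AJAX update has replaced the table, the old element becomes stale
    WebDriverWait(driver, timeout).until(EC.staleness_of(old_tbody))

You would call this between two driver.page_source grabs instead of time.sleep(1); either approach works, the explicit wait is just less sensitive to slow responses.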

2 - How to extract the text from the PDFs after getting them?

After downloading a PDF, we can load it into a variable, open it with PyPDF2, and extract the text from all its pages. I'll let you look at the solution code.

Here is a working solution. It will iterate over the first n pages you want and return the text from all the PDFs you are interested in:

import os
import time
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import PyPDF2
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Create a headless chromedriver to query and perform action on webpages like a browser
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# Main url
my_url = (
    "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"
)


def get_n_first_pages(n: int):
    """Get the html text for the first n pages

    Args:
        n (int): The number of pages we want

    Returns:
        List[str]: A list of html text
    """

    # Initialize the variables containing the pages
    pages = []

    # We query the web page with our chrome driver.
    # This way we can iteratively click on the next link to get all the pages we want
    driver.get(my_url)
    # We append the page source code
    pages.append(driver.page_source)

    # Then for all subsequent pages, we click on next and wait to get the page
    for _ in range(1, n):
        driver.find_element_by_css_selector(
            "#LiverpoolTheme_wt93_block_wtMainContent_RichWidgets_wt132_block_wt28"
        ).click()
        # Wait for the page to load
        time.sleep(1)
        # Append the page
        pages.append(driver.page_source)
    return pages


def get_pdf(link: str):
    """Get the pdf text, per PDF pages, for a given link.

    Args:
        link (str): The link where we can retrieve the PDF

    Returns:
        List[str]: A list containing one string per PDF page
    """

    # We extract the file name
    pdf_name = link.split("/")[-1].split(".")[0]

    # We get the page containing the PDF link
    # Here we don't need the chrome driver since we don't have to click on the link
    # We can just get the PDF using requests after finding the href
    pdf_link_page = requests.get(link)
    page_soup = soup(pdf_link_page.text, "lxml")
    # We get all <a> tag that have href attribute, then we select only the href
    # containing min.pdf, since we only want the PDF for the minutes
    pdf_link = [
        urljoin(link, l.attrs["href"])
        for l in page_soup.find_all("a", {"href": True})
        if "min.pdf" in l.attrs["href"]
    ]
    # There is only one PDF for the minutes so we get the only element in the list
    pdf_link = pdf_link[0]

    # We get the PDF with requests and then get the PDF bytes
    pdf_bytes = requests.get(pdf_link).content
    # We load the bytes into an in memory file (to avoid saving the PDF on disk)
    p = BytesIO(pdf_bytes)
    p.seek(0, os.SEEK_END)

    # Now we can load our PDF in PyPDF2 from memory
    read_pdf = PyPDF2.PdfFileReader(p)
    count = read_pdf.numPages
    pages_txt = []
    # For each page we extract the text
    for i in range(count):
        page = read_pdf.getPage(i)
        pages_txt.append(page.extractText())

    # We return the PDF name as well as the text inside each page
    return pdf_name, pages_txt


# Get the first 2 pages, you can change this number
pages = get_n_first_pages(2)


# Initialize a list to store each dataframe rows
df_rows = []

# We iterate over each page
for page in pages:
    page_soup = soup(page, "lxml")

    # Here we get only the <a> tag inside the tbody and each tr
    # We avoid getting the links from the head of the table
    all_links = page_soup.select("tbody tr a")
    # We extract the href for only the links containing council (we don't care about the
    # video link)
    minutes_links = [x.attrs["href"] for x in all_links if "council" in x.attrs["href"]]

    # For each minutes link, download the PDF and extract its text
    for link in minutes_links:
        pdf_name, pages_text = get_pdf(link)

        df_rows.append(
            {
                "PDF_file_name": pdf_name,
                # We join each page in the list into one string, separating them with a line return
                "PDF_text": "\n".join(pages_text),
            }
        )

        # Note: these two break statements stop the loop after the first PDF of the
        # first page (useful for testing); remove both of them to process every PDF
        break
    break

# We create the data frame from the list of rows
df = pd.DataFrame(df_rows)

Outputs a dataframe like:

        PDF_file_name                                           PDF_text
    0  spec20210729ag   \n \n \n \n \n \n \nSPECIAL COUNCIL MEET\nING...
...

Keep scraping the web, it's fun :)

JimZer
  • Wow, thank you infinitely! I'm getting an error related to the webdriver.. any idea what this might pertain to? Traceback (most recent call last): File "C:\Users\m\Anaconda3\lib\site-packages\selenium\webdriver\common\service.py", line 72, in start self.process = subprocess.Popen(cmd, env=self.env, File "C:\Users\m\Anaconda3\lib\subprocess.py", line 858, in __init__ self._execute_child(args, executable, preexec_fn, close_fds, FileNotFoundError: [WinError 2] The system cannot find the file specified – scotiaboy Aug 14 '21 at 00:13
  • My pleasure :) Regarding the error you have, you might not have the chromedriver installed, or not set in the default path. You can install it and specify the location as follows: `driver = webdriver.Chrome(chrome_options=options, executable_path="path/to/chromedriver/executable")` with the proper path. On Ubuntu, I think you can install it with `sudo apt-get install chromium-chromedriver `. What is your OS? – JimZer Aug 14 '21 at 00:20
  • I'm on Windows 10, but it looks like I might need admin permissions to download chromedriver on my computer! I will try to get it downloaded and specify the path :) – scotiaboy Aug 14 '21 at 00:41
  • If you are on Windows 10, this StackOverflow post might be helpful: https://stackoverflow.com/a/34522424/4875236 Hope you will get it to work. – JimZer Aug 14 '21 at 09:44
  • This worked great, thank you so much again. I'm wondering - is there any way to preserve the file name from each PDF? i.e. read the file name into one column and the PDF text into a second column for each file? – scotiaboy Aug 17 '21 at 20:49
  • You're welcome. I'm glad it helped you :) I'm not sure I get what you mean by getting the PDF file name in one column and the text into another column. Can you provide a small example of what you want to achieve? – JimZer Aug 17 '21 at 21:41
  • Absolutely. Basically, I'm hoping to transfer the files into a data frame where one column is the file name and the other column is the text (as a string) from each PDF. I edited the original post to visualize this.. I hope that helps! – scotiaboy Aug 18 '21 at 00:12
  • Ah great, it's clear now. I edited the solution to do exactly what you want. We now extract the pdf name and use it to construct a Dataframe. You can try it. Let me know if you have questions. – JimZer Aug 18 '21 at 08:12
  • Hello again @JimZer! Thank you so much for this. This does exactly what I was hoping, but for some reason the dataframe only shows the first PDF text file. I can't seem to get it to add subsequent rows for each file. Any idea what might be happening? – scotiaboy Aug 26 '21 at 18:51

The issue is that BeautifulSoup won't see any results besides those for the first page. BeautifulSoup is just an XML/HTML parser; it's not a headless browser or a JavaScript-capable runtime environment that can run JavaScript asynchronously. When you make a simple HTTP GET request to your page, the response is an HTML document in which the first page's results are directly baked into the HTML. These contents were baked into the document at the time the server served it to you, so BeautifulSoup can see them no problem. All the other pages of results, however, are more tricky.

View the page in a browser. While logging your network traffic, click on the "next" button to view the next page's results. If you're filtering your traffic by XHR/Fetch requests only, you'll notice an HTTP POST request being made to an ASP.NET server, the response of which is HTML containing JavaScript containing JSON containing HTML. It's this nested HTML structure that represents the new content with which to update the table. Clicking this button doesn't actually take you to a different URL - the contents of the table simply change. The DOM is being updated/populated asynchronously using JavaScript, which is not uncommon.

The challenge, then, is to mimic these requests and parse the response to extract the HREFs of only those links in which you're interested. I would split this up into three distinct scripts:

  1. One script to generate a .txt file of all sub-page URLs (these would be the URLs you navigate to when clicking links like "Agenda and Minutes", for example)
  2. One script to read from that .txt file, make requests to each URL, and extract the HREF to the PDF on that page (if one is available). These direct URLs to PDFs will be saved in another .txt file.
  3. A script to read from the PDF-URL .txt file, and perform PDF analysis.

You could combine scripts one and two if you really want to. I felt like splitting it up.
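If you did want to combine them, it could look something like this sketch, which reuses the get_urls and get_pdf_url functions defined below and skips the intermediate page_urls.txt file:

def main():
    # Deduplicate the page URLs, then resolve each one directly to its PDF URL
    with open("pdf_urls.txt", "w") as file:
        for page_url in set(get_urls()):
            pdf_url = get_pdf_url(page_url)
            if pdf_url:
                file.write(pdf_url + "\n")
    return 0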


The first script makes an initial request to the main page to get some necessary cookies, and to extract a hidden input __OSVSTATE that's baked into the HTML, which the ASP.NET server cares about in our future requests. It then simulates "clicks" on the "next" button by sending HTTP POST requests to a specific ASP.NET server endpoint. We keep going until we can't find a "next" button on the page anymore. It turns out there are around 260 pages of results in total. For each of these 260 responses, we parse the response, pull the HTML out of it, and extract the HREFs. We only keep those tags whose HREF ends with the substring ".htm", and whose text contains the substring "minute" (case-insensitive). We then write all HREFs to a text file page_urls.txt. Some of these will be duplicated for some reason, and others end up being invalid links, but we'll worry about that later. Here's the entire generated text file.

def get_urls():
    import requests
    from bs4 import BeautifulSoup as Soup
    import datetime
    import re
    import json

    # Start by making the initial request to store the necessary cookies in a session
    # Also, retrieve the __OSVSTATE

    url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx?SearchType=3"

    headers = {
        "user-agent": "Mozilla/5.0"
    }

    session = requests.Session()

    response = session.get(url, headers=headers)
    response.raise_for_status()

    soup = Soup(response.content, "html.parser")

    osv_state = soup.select_one("input[id=\"__OSVSTATE\"]")["value"]

    # Get all results from all pages

    url = "https://covapp.vancouver.ca/councilMeetingPublic/CouncilMeetings.aspx"

    headers = {
        "user-agent": "Mozilla/5.0",
        "x-requested-with": "XMLHttpRequest"
    }

    payload = {
        "__EVENTTARGET": "LiverpoolTheme_wt93$block$wtMainContent$RichWidgets_wt132$block$wt28",
        "__AJAX": "980,867,LiverpoolTheme_wt93_block_wtMainContent_RichWidgets_wt132_block_wt28,745,882,0,277,914,760,"
    }

    while True:
        params = {
            "_ts": round(datetime.datetime.now().timestamp())
        }
        payload["__OSVSTATE"] = osv_state

        response = session.post(url, params=params, headers=headers, data=payload)
        response.raise_for_status()

        pattern = "OsJSONUpdate\\(({\"outers\":{[^\\n]+})\\)//\\]\\]"

        jsn = re.search(pattern, response.text).group(1)
        data = json.loads(jsn)

        osv_state = data["hidden"]["__OSVSTATE"]

        html = data["outers"]["LiverpoolTheme_wt93_block_wtMainContent_wtTblCommEventTable_Wrapper"]["inner"]

        soup = Soup(html, "html.parser")

        # Select only those a-tags whose href attribute ends with ".htm" and whose text contains the substring "minute"
        tags = soup.select("a[href$=\".htm\"]")

        hrefs = [tag["href"] for tag in tags if "minute" in tag.get_text().casefold()]

        yield from hrefs

        page_num = soup.select_one("a.ListNavigation_PageNumber").get_text()
        records_message = soup.select_one("div.Counter_Message").get_text()

        print("Page #{}:\n\tProcessed {}, collected {} URL(s)\n".format(page_num, records_message, len(hrefs)))

        if soup.select_one("a.ListNavigation_Next") is None:
            break


def main():
    with open("page_urls.txt", "w") as file:
        for url in get_urls():
            file.write(url + "\n")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())

The second script reads the output file of the previous one, and makes a request to each URL in the file. Some of these will be invalid, some need to be cleaned up in order to be used, many will be duplicates, some will be valid but won't contain a link to a PDF, etc. We visit each page and extract the PDF URL, and save each in a file. In the end I've managed to collect 287 usable PDF URLs. Here is the generated text file.

def get_pdf_url(url):
    import requests
    from bs4 import BeautifulSoup as Soup

    url = url.replace("/ctyclerk", "")

    base_url = url[:url.rfind("/")+1]

    headers = {
        "user-agent": "Mozilla/5.0"
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        return ""

    soup = Soup(response.content, "html.parser")

    pdf_tags = soup.select("a[href$=\".pdf\"]")

    tag = next((tag for tag in pdf_tags if "minute" in tag.get_text()), None)

    if tag is None:
        return ""

    return tag["href"] if tag["href"].startswith("http") else base_url + tag["href"]



def main():

    with open("page_urls.txt", "r") as file:
        page_urls = set(file.read().splitlines())

    with open("pdf_urls.txt", "w") as file:
        for count, pdf_url in enumerate(map(get_pdf_url, page_urls), start=1):
            if pdf_url:
                status = "Success"
                file.write(pdf_url + "\n")
                file.flush()
            else:
                status = "Skipped"
                
            print("{}/{} - {}".format(count, len(page_urls), status))

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())

The third script would read from the pdf_urls.txt file, make a request to each URL, and then interpret the response bytes as a PDF:

def main():

    import requests
    from io import BytesIO
    from PyPDF2 import PdfFileReader

    with open("pdf_urls.txt", "r") as file:
        pdf_urls = file.read().splitlines()

    for pdf_url in pdf_urls:
        response = requests.get(pdf_url)
        response.raise_for_status()

        content = BytesIO(response.content)

        reader = PdfFileReader(content)
        # do stuff with reader

    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())
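If the end goal is the dataframe described in the question (file name in one column, extracted text in the other), the "do stuff with reader" part could be fleshed out along the lines of the sketch below. Note the assumptions: pandas is available, the file name is simply derived from the URL, and it uses the same (older) PyPDF2 API as above:

def main():

    import requests
    import pandas as pd
    from io import BytesIO
    from PyPDF2 import PdfFileReader

    with open("pdf_urls.txt", "r") as file:
        pdf_urls = file.read().splitlines()

    rows = []
    for pdf_url in pdf_urls:
        response = requests.get(pdf_url)
        response.raise_for_status()

        reader = PdfFileReader(BytesIO(response.content))
        # Concatenate the text of every page into a single string
        text = "\n".join(reader.getPage(i).extractText() for i in range(reader.numPages))

        rows.append({
            "PDF_file_name": pdf_url.split("/")[-1].split(".")[0],  # e.g. "spec20210729min"
            "PDF_text": text,
        })

    df = pd.DataFrame(rows)
    print(df)
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(main())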
Paul M.