I'm trying to scrape the data from every page of this URL: https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf. The URL does not change when you click on the next page, so I can't just request a different URL for each page. I searched for how to do this, but either didn't find a solution or didn't understand the ones I found. I don't understand how to make my code loop through all the pages and collect the data.
My current code (it only grabs the data from the first page):
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf'

def getdata(your_url):
    page = requests.get(your_url)
    soup = BeautifulSoup(page.text, 'lxml')
    table1 = soup.find('div', id='ocrForm:reportResultTable')

    # Column headers come from the <th> cells
    headers = []
    for i in table1.find_all('th'):
        headers.append(i.text)

    # One DataFrame row per <tr>, skipping the header row
    df = pd.DataFrame(columns=headers)
    for j in table1.find_all('tr')[1:]:
        row = [td.text for td in j.find_all('td')]
        df.loc[len(df)] = row

    # The 'Expand All' column only holds the expand/collapse icons
    df = df.drop(['Expand All'], axis=1)
    return df

df = getdata(your_url=url)
df
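From what I've read, the URL never changes because clicking "next" submits the JSF form as a POST (replaying hidden fields like javax.faces.ViewState) rather than loading a new URL, which is why plain requests only ever sees page one. The only approach I can picture is driving a real browser with Selenium and clicking the paginator's "next" arrow between scrapes, roughly like the sketch below. To be clear, the selectors (ui-paginator-next, ui-state-disabled) and the fixed sleep are guesses about how the page behaves, not something I've verified. Is this a reasonable way to do it, or is there a simpler requests-only way?

# Rough sketch only -- the paginator selectors below are assumptions about
# the site's markup, and the time.sleep() is a crude wait for the AJAX reload.
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'https://ocrportal.hhs.gov/ocr/breach/breach_report.jsf'

def parse_table(html):
    # Same parsing as getdata(), but on an HTML string instead of a URL
    soup = BeautifulSoup(html, 'lxml')
    table1 = soup.find('div', id='ocrForm:reportResultTable')
    headers = [th.text for th in table1.find_all('th')]
    rows = [[td.text for td in tr.find_all('td')]
            for tr in table1.find_all('tr')[1:]]
    return pd.DataFrame(rows, columns=headers).drop(['Expand All'], axis=1)

driver = webdriver.Chrome()
driver.get(url)
wait = WebDriverWait(driver, 10)

pages = []
while True:
    # Wait for the results table, then scrape the currently rendered HTML
    wait.until(EC.presence_of_element_located((By.ID, 'ocrForm:reportResultTable')))
    pages.append(parse_table(driver.page_source))

    # Guessing the "next page" arrow is a PrimeFaces-style paginator link
    next_btn = driver.find_element(By.CSS_SELECTOR, 'a.ui-paginator-next')
    if 'ui-state-disabled' in next_btn.get_attribute('class'):
        break  # last page reached
    next_btn.click()
    time.sleep(2)  # crude pause so the table has time to refresh

driver.quit()
df = pd.concat(pages, ignore_index=True)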