I have started to write a program to web-scrape a table of data from a number of URLs. I have got to the point where I import an Excel spreadsheet to build a list of URLs, loop through that list, and scrape a specific table on each webpage by searching for the table's header.
The print statement at the end of the loop prints the table for each URL separately. Is there an easy way to append() the DataFrame rows together, the way I would append items to a list? The tables all have the same layout.
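With a plain list I could just append inside the loop, e.g. (an illustration only; scrape_row is a made-up placeholder for the scraping code below):

all_rows = []
for url in url_list:
    all_rows.append(scrape_row(url))  # scrape_row stands in for the code below

I am hoping there is something just as simple for DataFrames. Here is my code so far: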
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import ssl
# lxml and xlrd only need to be installed for pd.read_html / pd.read_excel to work;
# they do not need to be imported directly
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
# import the Excel file
ex = pd.read_excel(r'/Users/adamsewell/Desktop/GB_Basketball/Data/GB_Player_Tracking_Document.xlsm', sheet_name='Player URL')
yr = '2019-20'
# list the URLs from the Excel sheet
url_list = ex['URL'].tolist()
for url in url_list:
    # first header, used as a reference point
    table_title = 'International Regular Season Stats - Per Game'
    # derive the second header title, which marks the end of the table
    second_header = table_title.replace(' Per Game', ' Totals')
    html = urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, "html.parser")
    # find the two headers that bracket the table in the HTML
    start = soup.find('h2', text=table_title)
    end = soup.find('h2', text=second_header)
    content = ''  # prime content as an empty string
    item = start.nextSibling
    # while not at the end header, add each sibling's HTML to content
    while item != end:
        content += str(item)
        item = item.nextSibling
    # parse the captured HTML into a list of DataFrames and concat to one table
    dfs = pd.read_html(content)
    df = pd.concat(dfs)
    # remove unwanted rows (seasons other than the year of interest)
    indexNames = df[(df['Season'] != yr) & (df['Season'] != yr + ' *')].index
    df.drop(indexNames, inplace=True)
    # extract the player's name from the URL
    name_split = url.split('/')
    players_name = name_split[4].replace('-', ' ')
    # add a player-name column and move it to the first position
    df['Player Name'] = players_name
    first_col = df.pop('Player Name')
    df.insert(0, 'Player Name', first_col)
    print(df)
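My guess is that I need to collect each df in a list inside the loop and combine them once at the end, something like this sketch (all_dfs and combined are just names I made up; I am not sure this is the right approach):

all_dfs = []                                        # before the for loop
for url in url_list:
    # ... the scraping code above, ending with this url's df ...
    all_dfs.append(df)                              # instead of print(df)
combined = pd.concat(all_dfs, ignore_index=True)    # one table after the loop
print(combined)

Would that work, or is there a better pattern?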
I am really new to programming and only started using Python about three weeks ago, so the simpler the answer the better! Thank you.