I have written code to extract the data from the first page, but I am running into problems when trying to extract from all pages.
This is my code to extract data from page 'a':
from bs4 import BeautifulSoup
import urllib.request
import os
from string import ascii_lowercase

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, 'html.parser')
    return soupdata

playerdatasaved = ""
soup = make_soup('https://www.basketball-reference.com/players/a/')
for record in soup.findAll("tr"):
    playerdata = ""
    for data in record.findAll(["th", "td"]):
        playerdata = playerdata + "," + data.text
    playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

print(playerdatasaved)

header = "player, from, to, position, height, weight, dob, year, colleges" + "\n"
file = open(os.path.expanduser("basketballstats.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(playerdatasaved[1:], encoding="ascii", errors="ignore"))
Now, to loop through all of the letter pages, my logic is this:
from bs4 import BeautifulSoup
import urllib.request
import os
from string import ascii_lowercase

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, 'html.parser')
    return soupdata

playerdatasaved = ""
for letter in ascii_lowercase:
    soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
    for record in soup.findAll("tr"):
        playerdata = ""
        for data in record.findAll(["th", "td"]):
            playerdata = playerdata + "," + data.text
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

header = "player, from, to, position, height, weight, dob, year, colleges" + "\n"
file = open(os.path.expanduser("basketball.csv"), "wb")
file.write(bytes(header, encoding="ascii", errors="ignore"))
file.write(bytes(playerdatasaved[1:], encoding="ascii", errors="ignore"))
However, this raises an error on the line: soup = make_soup("https://www.basketball-reference.com/players/" + letter + "/")
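In case it helps, here is a minimal sketch of how I could wrap the request in a try/except so the loop keeps going. This assumes the error is an HTTP error raised by urlopen (for example, a 404 for a letter that has no player index page), which I have not confirmed, and I'm not sure skipping the letter is the right fix:

from bs4 import BeautifulSoup
import urllib.error
import urllib.request
from string import ascii_lowercase

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    return BeautifulSoup(thepage, 'html.parser')

playerdatasaved = ""
for letter in ascii_lowercase:
    url = "https://www.basketball-reference.com/players/" + letter + "/"
    try:
        soup = make_soup(url)
    except urllib.error.HTTPError as e:
        # Assumption: the failure is an HTTP error (e.g. 404 for a letter
        # with no player page); skip that letter and continue the loop.
        print("Skipping", url, "-", e)
        continue
    for record in soup.findAll("tr"):
        playerdata = ""
        for data in record.findAll(["th", "td"]):
            playerdata = playerdata + "," + data.text
        playerdatasaved = playerdatasaved + "\n" + playerdata[1:]

Is this the right way to handle it, or is something else wrong with how I build the URL for each letter?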