I am writing a python script that uses selenium to parse each page of basketball stats on ESPN over the last 18 years (each year's stats is its own web page). I am able to connect to the site and parse no problem, however my results are not populating in the terminal while the parsing is occurring. I used a regex checker to make sure the elements I am trying to grab (for now, just the value after "data-idx=" in the html) are correct and they seem to be so I am not too sure what I am doing wrong. Please see code below:
import requests
import pandas as pd
import re
import time
from selenium import webdriver
# Initializing parameters an tools
driver = webdriver.Chrome()
url = "https://www.espn.com/nba/stats/player/_/season/$NUM$/seasontype/2/table/offensive/sort/avgPoints/dir/desc"
# Parsing the starting page to calculate total number of pages
starting_URL = url.replace("$NUM$", str(2002))
print("Starting with:" + starting_URL)
driver.get(starting_URL)
starting_page_content = driver.page_source
# Collecting stats from all pages
for i in range(2001,2020):
page_URL = url.replace("$NUM$", str(i+1))
print("Collecting stats from: " + page_URL)
driver.get(page_URL)
time.sleep(1) # a good practice is to wait a little time between each HTTP request
page_content = driver.page_source # getting HTML source of page i
all_chunks = re.compile(r'Table__TR--sm(.*?)data-idx=\"([^\"]+)\"').findall(page_content) # @UndefinedVariable
if len(all_chunks) > 0: # if found any
for chunk in all_chunks:
#initialization
player_index=""
#parsing index
indexes = re.compile(r'data-idx=\"([^\"]+)\"stack ',re.S|re.I).findall(str(chunk)) # @UndefinedVariable
if(len(indexes) > 0):
player_index = indexes.group(1)[0]
print(player_index) # printing collected data to screen
driver.close()