You're making it a little more complicated than it needs to be. All the data is in there if you look at the `<script>` tags. In most cases it's already in a nice JSON format and just requires a bit of string splitting to get at the structure. In this particular case, though, it looks a little different:
<script>
var shotsData = JSON.parse('\x7B\x22h\x22\x3A\x5B\x7B\x22id\x22\x3A\x22271478\x22,\x22minute\x22\x3A\x226\x22,\x22result\x22\x3A\x22MissedShots\x22,\x22....
But not to fear, it can still be worked with using some regex. I also converted the shots data and the rosters data from JSON to DataFrames, but the match data is a single key with all the values, so I didn't bother with that since it would just be one row. You may not even need the DataFrames and can just work off the JSON format, but it's all there for you:
import requests
import json
import re
from pandas.io.json import json_normalize
import pandas as pd
def _extract_embedded_json(html, var_name):
    """Return the object embedded as ``<var_name> = JSON.parse('...')`` in *html*.

    The page stores each payload as a single-quoted, backslash-escaped string
    (``\\x7B\\x22h\\x22...``) passed to ``JSON.parse``. The escapes are undone
    with the ``unicode_escape`` codec before the text is handed to
    ``json.loads``.

    Raises:
        ValueError: if no assignment for *var_name* is found in *html*.
    """
    # Raw string so that ``\s`` and ``\(`` reach the regex engine untouched;
    # the dot in ``JSON.parse`` is escaped so it only matches a literal dot.
    found = re.search(
        re.escape(var_name) + r"\s+=\s+JSON\.parse\('([^']+)", html
    )
    if found is None:
        raise ValueError(
            "no JSON.parse payload found for {!r}".format(var_name)
        )
    decoded_string = bytes(found.group(1), 'utf-8').decode('unicode_escape')
    return json.loads(decoded_string)


# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() reports an HTTP error as such instead of letting it
# show up later as a failed pattern match.
response = requests.get('https://understat.com/match/9457', timeout=30)
response.raise_for_status()

shotsObj = _extract_embedded_json(response.text, 'shotsData')
matchObj = _extract_embedded_json(response.text, 'match_info')
rostersObj = _extract_embedded_json(response.text, 'rostersData')
# Shots Data into a DataFrame
# pd.json_normalize is the public (pandas >= 1.0) location of json_normalize.
away_shots_df = pd.json_normalize(shotsObj['a'])
home_shots_df = pd.json_normalize(shotsObj['h'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and, like append, keeps each side's own 0..n-1 index.
shots_df = pd.concat([away_shots_df, home_shots_df])

# Rosters Data into a DataFrame
# Each value of rostersObj[side] is one player's record and becomes one row;
# the dict keys are not used. Building each frame in one call avoids the
# row-by-row append loop. Each side is indexed 0..n-1, like the shots frames.
away_rosters_df = pd.DataFrame(list(rostersObj['a'].values()))
home_rosters_df = pd.DataFrame(list(rostersObj['h'].values()))
rosters_df = pd.concat([away_rosters_df, home_rosters_df])

# Side code ('a' / 'h') -> team name, plus a "home vs. away" title.
teams_dict = {'a': matchObj['team_a'], 'h': matchObj['team_h']}
match_title = matchObj['team_h'] + ' vs. ' + matchObj['team_a']
Output:
print (shots_df)
X ... xG
0 0.9069999694824219 ... 0.40696778893470764
1 0.8190000152587891 ... 0.05737118795514107
2 0.94 ... 0.5754774808883667
3 0.9319999694824219 ... 0.02447112277150154
4 0.725 ... 0.02365683950483799
5 0.7759999847412109 ... 0.026968277990818024
6 0.8619999694824219 ... 0.08384699374437332
7 0.7659999847412109 ... 0.013624735176563263
0 0.9269999694824219 ... 0.055443812161684036
1 0.835 ... 0.03609708696603775
2 0.9059999847412109 ... 0.03347432240843773
3 0.9769999694824218 ... 0.07148116827011108
4 0.9869999694824219 ... 0.9712227582931519
5 0.8390000152587891 ... 0.028583310544490814
6 0.8580000305175781 ... 0.07498162239789963
7 0.924000015258789 ... 0.04431726038455963
8 0.9569999694824218 ... 0.48726019263267517
9 0.9540000152587891 ... 0.06847231835126877
10 0.91 ... 0.07779974490404129
11 0.875999984741211 ... 0.04344969615340233
12 0.8780000305175781 ... 0.019344232976436615
13 0.789000015258789 ... 0.043812621384859085
14 0.9419999694824219 ... 0.34188181161880493
15 0.9 ... 0.05839642137289047
16 0.9069999694824219 ... 0.043319668620824814
17 0.8490000152587891 ... 0.058181893080472946
18 0.9019999694824219 ... 0.09132817387580872
19 0.87 ... 0.11395697295665741
20 0.8819999694824219 ... 0.035116128623485565
[29 rows x 20 columns]
ADDITIONAL
As suspected, the Timing Chart is generated from the 'xG' column in the shotsData. It's merely a running sum of the xG for each team. I also provide a line chart at the end, where you can hover over the graph. Check out plotly: I have used it before and it's great; however, it is beyond the scope of the question. But here is a quick one I did:
Timing Chart
#########################################################################
# Timing Chart is an aggregation (running sum) of xG from the shotsData
#########################################################################
import numpy as np

# Convert 'minute' to int and 'xG' to float, then sort the shots by 'minute'
shots_df['minute'] = shots_df['minute'].astype(int)
shots_df['xG'] = shots_df['xG'].astype(float)
timing_chart_df = shots_df[['h_a', 'minute', 'xG']].sort_values('minute')
# Replace the 'a' / 'h' side codes with the team names
timing_chart_df['h_a'] = timing_chart_df['h_a'].map(teams_dict)

# Latest minute with a shot: upper bound of the per-minute grid built below
max_value = timing_chart_df['minute'].max()

# Aggregate xG within the same minute
timing_chart_df = timing_chart_df.groupby(['h_a', 'minute'], as_index=False)['xG'].sum()

# Build a full (team, minute) grid from minute 0 to max_value and fill the
# minutes without a shot with 0 xG. The teams come from teams_dict rather
# than from the shots themselves, so a side that never shot still gets a
# (flat) line; sorting keeps the same team order the groupby produces.
min_idx = np.arange(max_value + 1)
team_names = sorted(teams_dict.values())
m_idx = pd.MultiIndex.from_product([team_names, min_idx], names=['h_a', 'minute'])
timing_chart_df = timing_chart_df.set_index(['h_a', 'minute']).reindex(m_idx, fill_value=0).reset_index()

# Calculate the running sum of xG per team
timing_chart_df['running_sum_xG'] = timing_chart_df.groupby('h_a')['xG'].cumsum()

# Wide view: one row per team, one column per minute; the team column is
# labelled with the match title.
timing_chart_T_df = timing_chart_df.pivot(index='h_a', columns='minute', values='running_sum_xG')
timing_chart_T_df = timing_chart_T_df.reset_index().rename(columns={timing_chart_T_df.index.name: match_title})
Output:
print (timing_chart_T_df.to_string())
minute West Ham vs. Fulham 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
0 Fulham 0.406968 0.464339 1.039816 1.039816 1.039816 1.039816 1.039816 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.064288 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.087944 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.114913 1.198760 1.198760 1.198760 1.19876 1.19876 1.198760 1.198760 1.198760 1.198760 1.212384
1 West Ham 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.055444 0.091541 0.091541 0.091541 0.091541 0.091541 0.091541 1.167719 1.167719 1.196302 1.196302 1.196302 1.196302 1.271284 1.271284 1.315601 1.315601 1.315601 1.802862 1.802862 1.871334 1.949134 1.949134 1.992583 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.011928 2.055740 2.055740 2.055740 2.397622 2.397622 2.397622 2.397622 2.397622 2.397622 2.397622 2.456018 2.499338 2.55752 2.55752 2.648848 2.762805 2.797921 2.797921 2.797921
Plotly Line Chart:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

# NOTE(review): plotly >= 4 moved `plotly.plotly` and these credential/config
# helpers into the separate `chart_studio` package -- confirm against the
# installed plotly version.
plotly.tools.set_credentials_file(username='username', api_key='xxxxxxxxxxx')
plotly.tools.set_config_file(world_readable=True)

# timing_chart_df['h_a'] holds team names by this point (the 'a' / 'h' codes
# were mapped through teams_dict), so rows must be selected by team name;
# comparing against 'a' / 'h' matches nothing and plots empty lines.
away_team = teams_dict['a']
home_team = teams_dict['h']
away_rows = timing_chart_df[timing_chart_df['h_a'] == away_team]
home_rows = timing_chart_df[timing_chart_df['h_a'] == home_team]

# Create traces: one running-xG line per team
trace0 = go.Scatter(
    x=away_rows['minute'],
    y=away_rows['running_sum_xG'],
    mode='lines',
    name=away_team,
    line=dict(
        color='#E5E64B',
        width=4,
    ),
)
trace1 = go.Scatter(
    x=home_rows['minute'],
    y=home_rows['running_sum_xG'],
    mode='lines',
    name=home_team,
    line=dict(
        color='#00BCD4',
        width=4,
    ),
)
data_comp = [trace0, trace1]

layout_comp = go.Layout(
    autosize=False,
    width=800,
    height=600,
    title='Timing Chart',
    plot_bgcolor='#3E3E40',
    hovermode='x',
    xaxis=dict(
        title='Minute',
        ticklen=15,
        zeroline=True,
        showgrid=True,
        gridcolor='#39393B',
        gridwidth=2,
    ),
    yaxis=dict(
        title='xG',
        ticklen=5,
        gridwidth=2,
        zeroline=True,
        showgrid=True,
        gridcolor='#39393B',
    ),
)

fig_comp = go.Figure(data=data_comp, layout=layout_comp)
py.iplot(fig_comp, filename='line-mode')