Suppose you have a pandas.DataFrame like this one:
import pandas as pd
import numpy as np
from math import exp, log10
df = pd.DataFrame({'S' : ['C', 'A', 'A', 'B', 'B', 'A'],
                   'ID': ['ID3', 'ID1', 'ID2', 'ID4', 'ID4', 'ID1'],
                   'M' : ['Y', 'X', 'X', 'Y', 'Z', 'X'],
                   'V' : ['<1', '<0.5', 3, '>10', 7, 6]
                   })
df
# S ID M V
#0 C ID3 Y <1
#1 A ID1 X <0.5
#2 A ID2 X 3
#3 B ID4 Y >10
#4 B ID4 Z 7
#5 A ID1 X 6
and a dictionary that maps each M to a function of x, like:
M_to_transf_dict = {'X' : 'x', 'Y' : 'exp(x)', 'Z' : 'log10(x)'}
This is what you need to do:
- split V into the numerical part (V_u) and the qualifier (V_c: '<' or '>', if V is not already numerical or interpretable as such)
- group by S, M, and for each group:
  - calculate the mean V_mean, count V_N and sample standard deviation V_sample_sd of V_u
  - make the comma-separated list of unique ID's and store the result into ID
  - find the most frequent '<' or '>' qualifier in V_c and store the result into V_mean_c
- apply the function of x corresponding to each M to V_mean and store the result into TV_mean
- apply V_mean_c on V_mean and TV_mean, when '<' or '>'
After many trials, I put together something that seems to work, but I have doubts regarding performance.
I saw some posts (e.g. this one) questioning the use of .apply, and indeed from some tests it appears that handling the splitting of V as an external map followed by column assignment is much faster than .apply:
def unc(v):
    # Split a value into [numerical part, qualifier character].
    try:
        u = float(v)
        c = "="
    except ValueError:
        # not directly numeric: first character is the qualifier
        v = str(v)
        u = float(v[1:])
        c = v[0]
    except:
        # anything else (e.g. None): no value, no qualifier
        u = pd.NA
        c = "="
    return [u, c]
%timeit df[['V_u', 'V_c']] = df.apply(lambda row : unc(row['V']), axis = 1, result_type = 'expand')
# 698 µs ± 5.22 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
%%timeit
uc = df['V'].map(unc)
u = [uci[0] for uci in uc.values]
c = [uci[1] for uci in uc.values]
df['V_u'] = u
df['V_c'] = c
# 129 µs ± 3.59 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%%timeit
uc = df['V'].map(unc)
u, c = [], []
for uci in uc:
    u.append(uci[0])
    c.append(uci[1])
df['V_u'] = u
df['V_c'] = c
# 124 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
(Note that I remade df before each timeit.)
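For completeness, I suppose the split could also be vectorized entirely with pandas string methods; this is just a sketch I have not timed, assuming the qualifier is always a single leading '<' or '>' character:
s = df['V'].astype(str)
# first character is the qualifier if it is '<' or '>', otherwise '='
df['V_c'] = s.str[0].where(s.str[0].isin(['<', '>']), '=')
# strip any leading qualifier and parse the rest as a number
df['V_u'] = pd.to_numeric(s.str.lstrip('<>'), errors='coerce')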
For the other operations I described, except the last two, I used .groupby.agg with named aggregation:
def majority(c_list):
    # Most frequent '<'/'>' qualifier; '=' if neither occurs.
    c_list = list(c_list)
    Nlt = c_list.count('<')
    Ngt = c_list.count('>')
    if Nlt + Ngt == 0:
        c = '='
    else:
        # pick the qualifier with the larger count (ties go to '>')
        c = sorted(zip([Nlt, Ngt], ['<', '>']))[1][1]
    return c
%%timeit
df_summary_by_S_M = df.groupby(['S','M'], as_index = False).agg(
    ID = pd.NamedAgg(column = 'ID', aggfunc = lambda x : ','.join(np.unique(x))),
    V_mean = pd.NamedAgg(column = 'V_u', aggfunc = 'mean'),
    V_mean_c = pd.NamedAgg(column = 'V_c', aggfunc = majority),
    V_N = pd.NamedAgg(column = 'V_u', aggfunc = 'count'),
    V_sample_sd = pd.NamedAgg(column = 'V_u', aggfunc = 'std')
)
# 4.17 ms ± 61.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
df_summary_by_S_M
# S M ID V_mean V_mean_c V_N V_sample_sd
#0 A X ID1,ID2 3.166667 < 3 2.753785
#1 B Y ID4 10.000000 > 1 NaN
#2 B Z ID4 7.000000 = 1 NaN
#3 C Y ID3 1.000000 < 1 NaN
I don't know how to judge whether this timing is good, especially because I imagine .agg is a form of .apply, so given the previous results I might not be using the best possible approach.
On a real dataset I am handling, with about 450 K records that are reduced to 230 K by grouping, this aggregation takes about 1 minute.
I am going to have to handle much larger datasets, and this might become an issue.
Would anyone be able to suggest a more efficient/performant approach to do the grouped calculations I described?
Or is this state-of-the-art performance for this kind of operation? I really have no benchmarks to judge that, hence this question.
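One idea I had for the majority part (a sketch only, not timed; the column names lt/gt are mine) is to replace the Python-level aggfunc with vectorized boolean group sums, reproducing the tie-breaking of my majority (ties between '<' and '>' go to '>'):
# count '<' and '>' per group with boolean sums, then pick the winner
flags = df.assign(lt = df['V_c'].eq('<'), gt = df['V_c'].eq('>'))
counts = flags.groupby(['S','M'], as_index = False)[['lt','gt']].sum()
counts['V_mean_c'] = np.select(
    [counts['gt'].ge(counts['lt']) & counts['gt'].gt(0),
     counts['lt'].gt(counts['gt'])],
    ['>', '<'], default = '=')
The result could then be merged back into the grouped summary on S and M.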
For the last two steps, I think I will use list comprehensions, like:
%timeit df_summary_by_S_M['TV_mean'] = [eval(t) for x, t in \
    zip(df_summary_by_S_M['V_mean'], df_summary_by_S_M['M'].map(M_to_transf_dict))]
# 326 µs ± 5.06 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
df_summary_by_S_M
# S M ID V_mean V_mean_c V_N V_sample_sd TV_mean
#0 A X ID1,ID2 3.166667 < 3 2.753785 3.166667
#1 B Y ID4 10.000000 > 1 NaN 22026.465795
#2 B Z ID4 7.000000 = 1 NaN 0.845098
#3 C Y ID3 1.000000 < 1 NaN 2.718282
and:
df_summary_by_S_M['V_mean'] = [c + str(v) if c != '=' else v for c, v in
                               zip(df_summary_by_S_M['V_mean_c'], df_summary_by_S_M['V_mean'])]
df_summary_by_S_M['TV_mean'] = [c + str(v) if c != '=' else v for c, v in
                                zip(df_summary_by_S_M['V_mean_c'], df_summary_by_S_M['TV_mean'])]
df_summary_by_S_M
# S M ID V_mean V_mean_c V_N V_sample_sd \
#0 A X ID1,ID2 <3.1666666666666665 < 3 2.753785
#1 B Y ID4 >10.0 > 1 NaN
#2 B Z ID4 7.0 = 1 NaN
#3 C Y ID3 <1.0 < 1 NaN
# TV_mean
#0 <3.1666666666666665
#1 >22026.465794806718
#2 0.845098
#3 <2.718281828459045
Again, unless someone can suggest a more efficient alternative(?).
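If eval itself is a concern, I imagine these two steps could also be done without it; a sketch (not timed, and assuming positive means where log10 applies), using np.select for the per-M transforms and mask to prepend the qualifier while keeping plain floats where it is '=':
m = df_summary_by_S_M['M']
v = df_summary_by_S_M['V_mean'].astype(float)
# vectorized transforms: identity for X, exp for Y, log10 for Z
tv = pd.Series(np.select([m.eq('Y'), m.eq('Z')],
                         [np.exp(v), np.log10(v)], default = v),
               index = df_summary_by_S_M.index)
q = df_summary_by_S_M['V_mean_c']
df_summary_by_S_M['TV_mean'] = tv.astype(object).mask(q.ne('='), q + tv.astype(str))
df_summary_by_S_M['V_mean'] = v.astype(object).mask(q.ne('='), q + v.astype(str))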
EDIT: trying out @mozway's code
Original code, all operations timed together.
import pandas as pd
import numpy as np
from math import exp, log10
df0 = pd.DataFrame({'S' : ['C', 'A', 'A', 'B', 'B', 'A'],
                    'ID': ['ID3', 'ID1', 'ID2', 'ID4', 'ID4', 'ID1'],
                    'M' : ['Y', 'X', 'X', 'Y', 'Z', 'X'],
                    'V' : ['<1', '<0.5', 3, '>10', 7, 6]
                    })
M_to_transf_dict = {'X' : 'x', 'Y' : 'exp(x)', 'Z' : 'log10(x)'}
def unc(v):
    try:
        u = float(v)
        c = "="
    except ValueError:
        v = str(v)
        u = float(v[1:])
        c = v[0]
    except:
        u = pd.NA
        c = "="
    return [u, c]
def majority(c_list):
    c_list = list(c_list)
    Nlt = c_list.count('<')
    Ngt = c_list.count('>')
    if Nlt + Ngt == 0:
        c = '='
    else:
        c = sorted(zip([Nlt, Ngt], ['<', '>']))[1][1]
    return c
%%timeit
df = df0.copy()
uc = df['V'].map(unc)
u, c = [], []
for uci in uc:
    u.append(uci[0])
    c.append(uci[1])
df['V_u'] = u
df['V_c'] = c
df = df.groupby(['S','M'], as_index = False).agg(
    ID = pd.NamedAgg(column = 'ID', aggfunc = lambda x : ','.join(np.unique(x))),
    V_mean = pd.NamedAgg(column = 'V_u', aggfunc = 'mean'),
    V_mean_c = pd.NamedAgg(column = 'V_c', aggfunc = majority),
    V_N = pd.NamedAgg(column = 'V_u', aggfunc = 'count'),
    V_sample_sd = pd.NamedAgg(column = 'V_u', aggfunc = 'std')
)
df['TV_mean'] = [eval(t) for x, t in
                 zip(df['V_mean'], df['M'].map(M_to_transf_dict))]
df['V_mean'] = [c + str(v) if c != '=' else v for c, v in
                zip(df['V_mean_c'], df['V_mean'])]
df['TV_mean'] = [c + str(v) if c != '=' else v for c, v in
                 zip(df['V_mean_c'], df['TV_mean'])]
Result:
# 5.29 ms ± 100 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
@mozway's code.
def majority(s):
    mode = s.mode()
    return '=' if len(mode) > 1 else mode.iloc[0]
M_dic = {'Y' : np.exp, 'Z' : np.log10}
df = df0.copy()
%%timeit
(df
 .join(df['V']
       .str.extract(r'(?P<V_c>\D)?(?P<V_u>\d+(?:\.\d+)?)')
       .astype({'V_u': float}).fillna({'V_c': '=', 'V_u': df['V']})
 )
 .assign(TV=lambda d: d.groupby('M')['V_u'].apply(lambda g: M_dic[g.name](g) if g.name in M_dic else g))
 .groupby(['S','M'], as_index = False)
 .agg(**{'ID': ('ID', lambda x: ','.join(x.unique())),
         'V_mean': ('V_u', 'mean'),
         'V_mean_c': ('V_c', majority), ## FIXME
         'V_N': ('V_u', 'count'),
         'V_sample_sd': ('V_u', 'std'),
         'TV_mean': ('TV', 'mean'),
         })
 .assign(TV_mean=lambda d: d['V_mean_c'].mask(d['V_mean_c'].eq('='), '') + d['TV_mean'].astype(str))
)
Result:
# 8.05 ms ± 259 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
EDIT 2: trying again on a more realistic simulated dataset
import pandas as pd
import numpy as np
from numpy.random import default_rng
from math import exp, log10
# Simulate dataset to process
rng = default_rng(12345)
S = rng.choice(range(250000), 450000)
ID = S.copy()
S = [f"S_{Si}" for Si in S]
ID = [f"ID_{IDi}" for IDi in ID]
pM = [np.sqrt(1+i) for i in range(100)]
pM = pM / np.sum(pM)
M = rng.choice(range(100), 450000, p = pM)
M_to_transf_dict = dict()
for i in range(0,10):
    M_to_transf_dict[f"M_{i}"] = 'exp(x)'
for i in range(10,30):
    M_to_transf_dict[f"M_{i}"] = 'x'
for i in range(30,100):
    M_to_transf_dict[f"M_{i}"] = 'log10(x)'
M = [f"M_{Mi}" for Mi in M]
V = rng.random(450000)
Q = rng.choice(['', '<', '>'], 450000, p = [0.9, 0.05, 0.05])
V = [f"{q}{v}" for q, v in zip(Q, V)]
df0 = pd.DataFrame({'S' : S, 'ID' : ID, 'M' : M, 'V' : V})
Original code:
def unc(v):
    try:
        u = float(v)
        c = "="
    except ValueError:
        v = str(v)
        u = float(v[1:])
        c = v[0]
    except:
        u = pd.NA
        c = "="
    return [u, c]
def majority(c_list):
    c_list = list(c_list)
    Nlt = c_list.count('<')
    Ngt = c_list.count('>')
    if Nlt + Ngt == 0:
        c = '='
    else:
        c = sorted(zip([Nlt, Ngt], ['<', '>']))[1][1]
    return c
%%timeit
df = df0.copy()
uc = df['V'].map(unc)
u, c = [], []
for uci in uc:
    u.append(uci[0])
    c.append(uci[1])
df['V_u'] = u
df['V_c'] = c
df = df.groupby(['S','M'], as_index = False).agg(
    ID = pd.NamedAgg(column = 'ID', aggfunc = lambda x : ','.join(np.unique(x))),
    V_mean = pd.NamedAgg(column = 'V_u', aggfunc = 'mean'),
    V_mean_c = pd.NamedAgg(column = 'V_c', aggfunc = majority),
    V_N = pd.NamedAgg(column = 'V_u', aggfunc = 'count'),
    V_sample_sd = pd.NamedAgg(column = 'V_u', aggfunc = 'std')
)
df['TV_mean'] = [eval(t) for x, t in
                 zip(df['V_mean'], df['M'].map(M_to_transf_dict))]
df['V_mean'] = [c + str(v) if c != '=' else v for c, v in
                zip(df['V_mean_c'], df['V_mean'])]
df['TV_mean'] = [c + str(v) if c != '=' else v for c, v in
                 zip(df['V_mean_c'], df['TV_mean'])]
Result:
# 20 s ± 289 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
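Something I still need to do is time the aggregations separately, to see whether the Python-level aggfuncs (the ID join lambda and majority) dominate; e.g. (assuming df already has the V_u and V_c columns from the split):
g = df.groupby(['S','M'])
%timeit g['V_u'].mean()                                # cythonized
%timeit g['V_u'].std()                                 # cythonized
%timeit g['V_c'].agg(majority)                         # Python-level
%timeit g['ID'].agg(lambda x: ','.join(np.unique(x)))  # Python-level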
@mozway's code (M_dic changed according to the new set of M's):
def majority(s):
    mode = s.mode()
    return '=' if len(mode) > 1 else mode.iloc[0]

M_dic = dict()
for k in M_to_transf_dict:
    if M_to_transf_dict[k] == 'log10(x)':
        M_dic[k] = np.log10
    elif M_to_transf_dict[k] == 'exp(x)':
        M_dic[k] = np.exp
df = df0.copy()
%%timeit
(df
 .join(df['V']
       .str.extract(r'(?P<V_c>\D)?(?P<V_u>\d+(?:\.\d+)?)')
       .astype({'V_u': float}).fillna({'V_c': '=', 'V_u': df['V']})
 )
 .assign(TV=lambda d: d.groupby('M')['V_u'].apply(lambda g: M_dic[g.name](g) if g.name in M_dic else g))
 .groupby(['S','M'], as_index = False)
 .agg(**{'ID': ('ID', lambda x: ','.join(x.unique())),
         'V_mean': ('V_u', 'mean'),
         'V_mean_c': ('V_c', majority), ## FIXME
         'V_N': ('V_u', 'count'),
         'V_sample_sd': ('V_u', 'std'),
         'TV_mean': ('TV', 'mean'),
         })
 .assign(TV_mean=lambda d: d['V_mean_c'].mask(d['V_mean_c'].eq('='), '') + d['TV_mean'].astype(str))
)
Result:
# 52.3 s ± 436 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
EDIT 3: trying again, after simulating a dataset with duplicated {S, M} entries.
rng = default_rng(12345)
S = rng.choice(range(200000), 225000)
ID = S.copy()
S = [f"S_{Si}" for Si in S]
ID = [f"ID_{IDi}" for IDi in ID]
pM = [np.sqrt(1+i) for i in range(100)]
pM = pM / np.sum(pM)
M = rng.choice(range(100), 225000, p = pM)
M_to_transf_dict = dict()
for i in range(0,10):
    M_to_transf_dict[f"M_{i}"] = 'exp(x)'
for i in range(10,30):
    M_to_transf_dict[f"M_{i}"] = 'x'
for i in range(30,100):
    M_to_transf_dict[f"M_{i}"] = 'log10(x)'
M = [f"M_{Mi}" for Mi in M]
S = S + S
ID = ID + ID
M = M + M
V = rng.random(450000)
Q = rng.choice(['', '<', '>'], 450000, p = [0.9, 0.05, 0.05])
V = [f"{q}{v}" for q, v in zip(Q, V)]
df0 = pd.DataFrame({'S' : S, 'ID' : ID, 'M' : M, 'V' : V})
Original code:
10.6 s ± 154 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
@mozway's code:
25.8 s ± 550 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
(and the output looks different).
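If the per-group Python calls turn out to be the problem, one more idea I want to try (a sketch only, not validated on the real data) is to build the unique-ID lists from a deduplicated frame, so the join lambda sees far fewer rows:
# deduplicate (S, M, ID) first, then join per group
ids = (df[['S','M','ID']]
       .drop_duplicates()
       .sort_values(['S','M','ID'])
       .groupby(['S','M'], as_index = False)['ID']
       .agg(','.join))
and then merge ids back into the grouped summary.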