Question
How do I measure the performance of the various functions below in a concise and comprehensive way.
Example
Consider the dataframe df
df = pd.DataFrame({
'Group': list('QLCKPXNLNTIXAWYMWACA'),
'Value': [29, 52, 71, 51, 45, 76, 68, 60, 92, 95,
99, 27, 77, 54, 39, 23, 84, 37, 99, 87]
})
I want to sum up the Value
column grouped by distinct values in Group
. I have three methods for doing it.
import pandas as pd
import numpy as np
from numba import njit
def sum_pd(df):
return df.groupby('Group').Value.sum()
def sum_fc(df):
f, u = pd.factorize(df.Group.values)
v = df.Value.values
return pd.Series(np.bincount(f, weights=v).astype(int), pd.Index(u, name='Group'), name='Value').sort_index()
@njit
def wbcnt(b, w, k):
bins = np.arange(k)
bins = bins * 0
for i in range(len(b)):
bins[b[i]] += w[i]
return bins
def sum_nb(df):
b, u = pd.factorize(df.Group.values)
w = df.Value.values
bins = wbcnt(b, w, u.size)
return pd.Series(bins, pd.Index(u, name='Group'), name='Value').sort_index()
Are they the same?
print(sum_pd(df).equals(sum_nb(df)))
print(sum_pd(df).equals(sum_fc(df)))
True
True
How fast are they?
%timeit sum_pd(df)
%timeit sum_fc(df)
%timeit sum_nb(df)
1000 loops, best of 3: 536 µs per loop
1000 loops, best of 3: 324 µs per loop
1000 loops, best of 3: 300 µs per loop