This will print the progress as you go, where progress is measured by the fraction of the groups for which statistics are computed. But I'm not sure how much the loop will slow down your computations.
agger = {
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()}
gcols = ['B'] # columns defining the groups
groupby = df.groupby(gcols)
ngroups = len(groupby)
gfrac = 0.3 # fraction of groups for which you want to print progress
gfrac_size = max((1, int(ngroups*gfrac)))
groups = []
rows = []
for i,g in enumerate(groupby):
if (i+1)%gfrac_size == 0:
print('{:.0f}% complete'.format(100*(i/ngroups)))
gstats = g[1].agg(agger)
if i==0:
if gstats.ndim==2:
newcols = gstats.columns.tolist()
else:
newcols = gstats.index.tolist()
groups.append(g[0])
rows.append(gstats.values.flat)
df3 = pd.DataFrame(np.vstack(rows), columns=newcols)
if len(gcols) == 1:
df3.index = groups
else:
df3.index = pd.MultiIndex.from_tuples(groups, names=gcols)
df3 = df3.astype(df[newcols].dtypes)
df3
C D E
1.0 1.5 10.0 a
2.0 3.0 12.0 b
3.0 7.0 8.0 a
An alternative (though somewhat hacky) way would be to take advantage of the fact that you use your own function lambda x: x.mode
. Since you're already compromising speed using this function, you can write a class that stores information about progress. For example,
import pandas as pd
import numpy as np
df = pd.DataFrame({"A":[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
"B":[1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0],
"C":[1.0, 1.5, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0],
"D":[2.0, 5.0, 3.0, 6.0, 4.0, 2.0, 5.0, 1.0, 2.0],
"E":['a', 'a', 'b', 'a', 'b', 'b', 'b', 'a', 'a']})
df2 = df.groupby('B').agg({
'C': 'mean',
'D': 'sum',
'E': lambda x: x.mode()
})
print(df2)
class ModeHack:
def __init__(self, size=5, N=10):
self.ix = 0
self.K = 1
self.size = size
self.N = N
def mode(self, x):
self.ix = self.ix + x.shape[0]
if self.K*self.size <= self.ix:
print('{:.0f}% complete'.format(100*self.ix/self.N))
self.K += 1
return x.mode()
def reset(self):
self.ix = 0
self.K = 1
mymode = ModeHack(size=int(.1*df.shape[0]), N=df.shape[0])
mymode.reset()
agger = {
'C': 'mean',
'D': 'sum',
'E': lambda x: mymode.mode(x)}
df3 = df.groupby('B').agg(agger)