Most of the test results discussed in other answers are skewed because they were measured on a trivially small test DataFrame. Pandas has a fixed setup overhead that is generally negligible, but it looks significant next to the time needed to process such a tiny dataset.
On a larger dataset, the fastest method is `pd.Series.mode()` with `agg()`:
# Group the rows by 'name' and aggregate each group's 'color' values with pd.Series.mode.
df.groupby('name')['color'].agg(pd.Series.mode)
Test bench:
# Seed records for the benchmark: one (name, day, color) observation per row.
arr = np.array(
    [
        ("John", 1, "White"),
        ("John", 2, "White"),
        ("John", 3, "Blue"),
        ("John", 4, "Blue"),
        ("John", 5, "White"),
        ("Tom", 2, "White"),
        ("Tom", 3, "Blue"),
        ("Tom", 4, "Blue"),
        ("Tom", 5, "Black"),
        ("Jerry", 1, "Black"),
        ("Jerry", 2, "Black"),
        ("Jerry", 4, "Black"),
        ("Jerry", 5, "White"),
    ],
    # Structured dtype: object-typed name and color, 64-bit integer day.
    dtype={"names": ["name", "day", "color"], "formats": ["O", "i8", "O"]},
)
from timeit import Timer
from itertools import groupby
from collections import Counter
# Inflate the seed records to 100,000 rows by sampling with replacement,
# so the timings are not dominated by fixed per-call overhead.
df = (
    pd.DataFrame.from_records(arr)
    .sample(n=100_000, replace=True)
)
def factorize(frame=None):
    """Return the most frequent ``color`` for each ``name`` via a count matrix.

    Both columns are integer-encoded with ``pd.factorize``; the code pairs are
    then tallied into a (names x colors) matrix and the column holding the
    largest count is selected for every row.

    Args:
        frame: DataFrame with ``name`` and ``color`` columns. When omitted,
            the module-level ``df`` is used, so the zero-argument call made
            by the benchmark keeps working.

    Returns:
        A ``pd.Series`` indexed by the unique names, holding for each name
        the color with the highest count. On a tie, ``argmax`` keeps the
        first maximum along the color axis.

    Note:
        ``pd.factorize`` encodes missing values as ``-1``; they are not
        filtered out here, so the input is expected to have no missing
        names or colors.
    """
    if frame is None:
        frame = df
    name_codes, names = pd.factorize(frame["name"])
    color_codes, colors = pd.factorize(frame["color"])
    counts = np.zeros((len(names), len(colors)), dtype=np.int64)
    # np.add.at accumulates repeated (row, col) pairs; a plain fancy-indexed
    # ``+=`` would apply each distinct pair only once.
    np.add.at(counts, (name_codes, color_codes), 1)
    return pd.Series(colors[counts.argmax(1)], names)
# One Timer per candidate. Each lambda recomputes a per-name 'color' result
# from the shared `df`, so every timing covers the complete computation.

# Count matrix built from factorized codes (see factorize()).
t_factorize = Timer(lambda: factorize())

# Group sizes per (name, color), pivoted wide, then the column with the max count.
t_idxmax = Timer(
    lambda: df.groupby(["name", "color"]).size().unstack().idxmax(1)
)

# pd.Series.mode through agg() and through apply().
t_aggmode = Timer(
    lambda: df.groupby("name")["color"].agg(pd.Series.mode)
)
t_applymode = Timer(
    lambda: df.groupby("name")
    .color.apply(pd.Series.mode)
    .reset_index(level=1, drop=True)
)

# collections.Counter per group, through agg() and through apply().
t_aggcounter = Timer(
    lambda: df.groupby("name")["color"].agg(
        lambda c: Counter(c).most_common(1)[0][0]
    )
)
t_applycounter = Timer(
    lambda: df.groupby("name").color.apply(
        lambda c: Counter(c).most_common(1)[0][0]
    )
)

# Pure-Python route: sort the rows, group them by their first field with
# itertools.groupby, and count the last field of each row in the group.
t_itertools = Timer(
    lambda: pd.Series(
        {
            x: Counter(z[-1] for z in y).most_common(1)[0][0]
            for x, y in groupby(
                sorted(df.values.tolist()), key=lambda x: x[0]
            )
        }
    )
)
# Number of executions timed per candidate.
n = 100

# All arguments are evaluated (i.e. every benchmark runs) before anything is
# printed; sep="\n" then puts each "expression=seconds" result on its own line.
# A single print call replaces a list comprehension that was used only for
# its side effects and built a throwaway list of None values.
print(
    f"{t_factorize.timeit(number=n)=}",
    f"{t_idxmax.timeit(number=n)=}",
    f"{t_aggmode.timeit(number=n)=}",
    f"{t_applymode.timeit(number=n)=}",
    f"{t_applycounter.timeit(number=n)=}",
    f"{t_aggcounter.timeit(number=n)=}",
    f"{t_itertools.timeit(number=n)=}",
    sep="\n",
)
t_factorize.timeit(number=n)=1.325189442
t_idxmax.timeit(number=n)=1.0613339019999999
t_aggmode.timeit(number=n)=1.0495010750000002
t_applymode.timeit(number=n)=1.2837302849999999
t_applycounter.timeit(number=n)=1.9432825890000007
t_aggcounter.timeit(number=n)=1.8283823839999993
t_itertools.timeit(number=n)=7.0855046380000015