In pandas iterrows
is not recommneded, check this answer. Alternative pandas only solution with same logic like your solution - per groups test if at least one match, filter by boolean indexing
and convert to dictioanry per id
with DataFrame.to_dict
:
def f(x):
a = x.to_numpy()
return np.any(a == -a[:, None], axis=1)
d = (df[df.groupby('id')['amount'].transform(f)]
.groupby('id')[['label','amount']]
.apply(lambda x: x.to_dict(orient='records'))
.to_dict())
print (d)
{1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5},
{'label': 'BB', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]}
Or is possible filter all duplicated with absolute amount
, but also is necessary test if exist negative with DataFrameGroupBy.nunique
per groups:
f = lambda x: x.to_dict(orient='records')
df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) &
df1.duplicated(['id','amount'], keep=False))
d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
print (d)
{1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5},
{'label': 'BB', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]}
If small DataFrame and performance not important need test values per groups by id
, for test use ==
with Series.any
:
from collections import defaultdict
df['negative_amount'] = - df['amount']
d = defaultdict(list)
for i, g in df.groupby('id'):
for index,row in g.iterrows():
if (g['negative_amount'] == row['amount']).any():
d[i].append({ "label": row["label"], 'amount':row['amount'] })
print (d)
defaultdict(<class 'list'>, {1: [{'label': 'A', 'amount': -12},
{'label': 'B', 'amount': 12},
{'label': 'G', 'amount': -12}],
2: [{'label': 'A', 'amount': 5},
{'label': 'BB', 'amount': -5}],
4: [{'label': 'AA', 'amount': 10},
{'label': 'AA', 'amount': -10}]})
Performance in 10k
rows and 1k
groups, last solution is slowiest:
np.random.seed(123)
N = 10000
d = {'id': np.random.randint(1000, size=N),
'label':np.random.choice(['A','A','B','G','A','BB','C','C','A','BB','B','AA','AA'], size=N),
'amount':np.random.choice([2,-12,12,-12,5,-5,2,3,5,3,3,10,-10], size=N)}
df = pd.DataFrame(d).sort_values('id', ignore_index=True)
print(df)
In [47]: %%timeit
...: def f(x):
...: a = x.to_numpy()
...: return np.any(a == - a[:, None], axis=1)
...:
...: d = (df[df.groupby('id')['amount'].transform(f)]
...: .groupby('id')[['label','amount']]
...: .apply(lambda x: x.to_dict(orient='records'))
...: .to_dict())
...:
225 ms ± 1.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [48]: %%timeit
...: f = lambda x: x.to_dict(orient='records')
...: df1 = df.assign(amount = df['amount'].abs(), new=np.sign(df['amount']))
...: m = (df1.groupby(['id','amount'])['new'].transform('nunique').gt(1) &
...: df1.duplicated(['id','amount'], keep=False))
...:
...: d = df[m].groupby('id')[['label','amount']].apply(f).to_dict()
...:
124 ms ± 9.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [49]: %%timeit
...: df['negative_amount'] = - df['amount']
...: d = defaultdict(list)
...:
...: for i, g in df.groupby('id'):
...: for index,row in g.iterrows():
...: if (g['negative_amount'] == row['amount']).any():
...: d[i].append({ "label": row["label"], 'amount':row['amount'] })
...:
3.51 s ± 366 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)