1

UPDATE2: I actually have 2000 draws not 3.

UPDATE: My df column A was wrong. I fixed it.

I have a really large version of the df shown below.

# Toy frame: grouping keys A/B, a payload column C, and three simulated draws.
data = {
    'A': [11111, 11111, 33333, 11111],
    'B': [101, 101, 102, 101],
    'C': [1, 2, 3, 4],
    'draw0': [5, 6, 2, 1],
    'draw1': [4, 3, 2, 1],
    'draw2': [2, 3, 4, 6],
}
df = pd.DataFrame(data)

     A     B   C  draw0   draw1   draw2
0  11111  101  1      5      4      2
1  11111  101  2      6      3      3
2  33333  102  3      2      2      4
3  11111  101  4      1      1      6

I am trying to find, for each draw column, which row within each (A, B) group wins that draw. Below is my current attempt; it works, but it is slow. I feel like there should be a way, with apply or something similar, to make it faster.

# All simulated-draw columns ('draw0', 'draw1', ...).
draw_cols = [col for col in df if col.startswith('draw')]

# One pass per column: each iteration regroups by (A, B), finds the row
# label holding that column's group maximum, then writes 1 there and 0
# everywhere else. Correct, but recomputing the groupby 2000 times (once
# per draw column) is what makes this slow.
for col in draw_cols:
    max_idx = df.groupby(['A', 'B'])[col].idxmax().values
    df.loc[max_idx, col] = 1
    df.loc[~df.index.isin(max_idx), col] = 0

Desired Output:

     A     B   C  draw0  draw1  draw2
0  11111  101  1      0      1      0
1  11111  101  2      1      0      0
2  33333  102  3      1      1      1
3  11111  101  4      0      0      1

I generate the 2000 columns like so:

def simulateDraw(df, n=2000):
    """Append ``n`` simulated beta draws to ``df`` as columns draw0..draw{n-1}.

    Each row's draws come from ``np.random.beta(row.C, row.C)`` — the same
    C value is used for both shape parameters, matching the original code.

    Parameters
    ----------
    df : pandas.DataFrame with a numeric 'C' column.
    n : int, number of draw columns to simulate (default 2000).

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the n draw columns.
    """
    # Vectorized fix: np.random.beta broadcasts its shape parameters, so one
    # call fills the whole (len(df), n) matrix. The original per-row
    # df.apply(lambda row: pd.Series(...)) built n-element Series one row at
    # a time, which is very slow for n=2000.
    shape_params = df['C'].to_numpy()[:, None]
    draws = pd.DataFrame(
        np.random.beta(shape_params, shape_params, size=(len(df), n)),
        index=df.index,
    ).add_prefix('draw')
    return pd.concat([df, draws], axis=1)
Climbs_lika_Spyder
  • 6,004
  • 3
  • 39
  • 53

3 Answers

3
# groupby + transform('idxmax'): every cell of the draw columns becomes the
# index label of its (A, B) group's maximum for that column.
max_idx = df.groupby(['A', 'B'])[df.columns[3:]].transform('idxmax')
# create a new column that is just your index
# this is done just in case your real data does not have a range index
max_idx['index'] = max_idx.index.values
# DataFrame.isin with a Series aligns on the index, so each cell is compared
# against its own row's label: True exactly where this row is the group
# winner. update() writes the 0/1 values back over the draw columns; the
# extra 'index' column is ignored because df has no column of that name.
df.update(max_idx.isin(max_idx['index']).astype(int))

       A    B  C  draw0  draw1  draw2
0  11111  101  1      0      1      0
1  11111  101  2      1      0      0
2  33333  102  3      1      1      1
3  11111  101  4      0      0      1
It_is_Chris
  • 13,504
  • 2
  • 23
  • 41
1

Check, for every (A, B) group, which values in each draw column equal that column's group maximum.

# For each (A, B) group, compare every draw column to its group-wise max:
# eq() yields booleans, astype('int') turns them into the desired 0/1, and
# update() writes them back into df in place. Note: ties within a group all
# receive 1 (visible in the singleton group, row 2).
df.update(df.groupby(['A','B'])[['draw0','draw1','draw2']].apply(lambda x: x.eq(x.max(0))).astype('int'))
df

Out:

       A    B  C  draw0  draw1  draw2
0  11111  101  1      0      1      0
1  11111  101  2      1      0      0
2  33333  102  3      1      1      1
3  11111  101  4      0      0      1

Micro-Benchmark

Results for simulateDraw(df, n=4)

4 draw columns benchmark

Results for simulateDraw(df, n=50) (more rows or columns exceeded my patience and RAM on a colab instance)

50 draw columns benchmark

Code used for the benchmark

import pandas as pd
import numpy as np
import perfplot

def simulateDraw(df, n=2000):
    """Append ``n`` simulated beta draws to ``df`` as columns draw0..draw{n-1}.

    Each row's draws use its C value for both shape parameters of the beta
    distribution, matching the original implementation.

    Parameters
    ----------
    df : pandas.DataFrame with a numeric 'C' column.
    n : int, number of draw columns to simulate (default 2000).

    Returns
    -------
    pandas.DataFrame with the original columns plus the n draw columns.
    """
    # One vectorized np.random.beta call replaces the slow per-row apply:
    # the (len(df), 1) parameter array broadcasts against size=(len(df), n).
    shape_params = df['C'].to_numpy()[:, None]
    draws = pd.DataFrame(
        np.random.beta(shape_params, shape_params, size=(len(df), n)),
        index=df.index,
    ).add_prefix('draw')
    return pd.concat([df, draws], axis=1)

def makedata(n=1):
    """Build a benchmark frame: the 4-row sample repeated n times, plus 2000 simulated draw columns."""
    base = pd.DataFrame({
        'A': [11111, 11111, 33333, 11111] * n,
        'B': [101, 101, 102, 101] * n,
        'C': [1, 2, 3, 4] * n,
    })
    return simulateDraw(base)

def forloop(df):
    """Binarize every draw column in place: 1 on each (A, B) group's argmax row, 0 elsewhere."""
    for col in [name for name in df if name.startswith('draw')]:
        # Index labels of the rows holding this column's per-group maximum.
        top_rows = df.groupby(['A', 'B'])[col].idxmax().values
        is_winner = df.index.isin(top_rows)
        df.loc[is_winner, col] = 1
        df.loc[~is_winner, col] = 0
    return df

def applyeq(df):
    draw_cols = [col for col in df if col.startswith('draw')]
    df.update(df.groupby(['A','B'])[draw_cols].apply(lambda x: x.eq(x.max(0))).astype('int'))
    return df


def idxmax(df):
    """Turn every draw column into 1 at its (A, B) group's argmax row and 0 elsewhere; mutates df."""
    draw_cols = [name for name in df if name.startswith('draw')]
    # Each cell becomes the index label of its group's maximum for that column.
    winner_labels = df.groupby(['A', 'B'])[draw_cols].transform('idxmax')
    # A cell wins exactly when the group's argmax label equals the row's own
    # label; works for any index, not just a RangeIndex.
    own_label = pd.Series(winner_labels.index, index=winner_labels.index)
    df.update(winner_labels.eq(own_label, axis=0).astype(int))
    return df


# Benchmark the three kernels on frames of 2**5 .. 2**21 rows (x-axis is
# len(df)). NOTE(review): all three kernels mutate their input frame in
# place, and perfplot passes the same setup output to each kernel at a given
# size — confirm perfplot's data handling so later kernels don't receive an
# already-binarized frame.
perfplot.show(
    setup=makedata,
    kernels=[idxmax,applyeq,forloop],
    n_range=[2**k for k in range(5,22)],
    xlabel='len(df)'
)
Michael Szczesny
  • 4,911
  • 5
  • 15
  • 32
0

This nested list comprehension does not need groupby and could be faster at updating the values: it replaces the per-element 'apply lambda' with np.where. It may be more efficient if your dataframe is large (I have not run any performance metrics, though!).

# For each unique B value: slice that group's draw columns, compare them to
# the group's column-wise maxima (np.where yields 1/0), re-attach the A/B/C
# columns, and stack all per-group frames back together.
# NOTE(review): this groups by B alone — it assumes B uniquely identifies
# each (A, B) group, which holds for the sample data; confirm for real data.
out = pd.concat(
            [
                pd.concat(
                            [
                                pd.DataFrame(
                                                np.where(
                                                            df.loc[df.B.isin([i]),['draw0','draw1','draw2']]==df.loc[df.B.isin([i]),['draw0','draw1','draw2']].max().to_numpy()[None,:],1,0
                                                        )
                                            ).reset_index(drop=True),\
                               df.loc[df.B.isin([i]),['A','B','C']].reset_index(drop=True)
                            ], axis=1, sort=False, ignore_index=True
                        ) for i in df.B.unique()
            ], axis=0, sort=False, ignore_index=True
            )


# ignore_index=True left integer column labels (0-2 are the draws, 3-5 are
# A/B/C); restore the names. The resulting column order differs from the
# input frame's order.
out.rename(columns = {0:'draw0',1:'draw1',2:'draw2',3:'A',4:'B',5:'C'}, inplace=True)
Siraj S.
  • 3,481
  • 3
  • 34
  • 48