2

I have the following data frame:

import pandas as pd

df = pd.DataFrame({'member': ['john', 'john', 'john', 'jake', 'jake', 'jake', 'jim', 'jim', 'jim'],
                   'age': [42, 43, 44, 35, 36, 37, 57, 58, 59],
                   'inpatient_count': [0, 1, 2, 1, 0, 0, 2, 1, 5],
                   'pcp_count': [4, 0, 6, 0, 3, 3, 0, 5, 2]})

df = df.sort_values('member')

print(df)
  member  age  inpatient_count  pcp_count
3   jake   35                1          0
4   jake   36                0          3
5   jake   37                0          3
6    jim   57                2          0
7    jim   58                1          5
8    jim   59                5          2
0   john   42                0          4
1   john   43                1          0
2   john   44                2          6

I would like to transform df into arrays that are grouped/nested by member, as done below, but I would like for something much faster when running over millions of members. I was hoping pd.to_numpy() would have a grouper argument, but I haven't figured it out yet.

import numpy as np

keep = [x for x in df.columns if x != 'member']
np.array(df.groupby('member')[keep].apply(lambda x: x.values.tolist()).tolist())

array([[[35,  1,  0],
        [36,  0,  3],
        [37,  0,  3]],

       [[57,  2,  0],
        [58,  1,  5],
        [59,  5,  2]],

       [[42,  0,  4],
        [43,  1,  0],
        [44,  2,  6]]])

bshelt141
  • 1,183
  • 15
  • 31
  • 1
    Maybe have look at this [answer](https://stackoverflow.com/a/53859634/9274732) with for you `np.split(a[:,1:], np.u...` and `a=df.to_numpy()` – Ben.T Feb 08 '21 at 16:46

1 Answers1

3

Since you bother sorting before, you can use np.arraysplit to figure out where to cut the array. First make sure to reset the index so it's an ordered IntIndex.

import numpy as np

df = df.sort_values('member').reset_index(drop=True)

splits = df['member'].ne(df['member'].shift()).loc[lambda x: x].index[1:]
np.array(np.array_split(df.drop(columns='member').to_numpy(), splits))
#array([[[35,  1,  0],
#        [36,  0,  3],
#        [37,  0,  3]],
# 
#       [[57,  2,  0],
#        [58,  1,  5],
#        [59,  5,  2]],
#
#       [[42,  0,  4],
#        [43,  1,  0],
#        [44,  2,  6]]])

Timed the two methods, the array_split (after sorting) is a good gain.

import perfplot
import pandas as pd
import numpy as np

def groupby_apply(df):
    keep = [x for x in df.columns if x != 'member']
    return np.array(df.groupby('member')[keep].apply(lambda x: x.values.tolist()).tolist())

def array_split(df):
    splits = df['member'].ne(df['member'].shift()).loc[lambda x: x].index[1:]
    return np.array(np.array_split(df.drop(columns='member').to_numpy(), splits))


perfplot.show(
    setup=lambda n: pd.DataFrame({'member': np.repeat(np.arange(1, n//2+1), 2),
                                  'age': np.random.randint(1, 100, n),
                                  'inpatient_count': np.random.randint(1, 10, n),
                                  'pcp_count': np.random.randint(1, 10, n)}),
    kernels=[
        lambda df: groupby_apply(df),
        lambda df: array_split(df),
    ],
    labels=['GroupBy Apply', 'np.array_split'],
    n_range=[2 ** k for k in range(2, 17)],
    equality_check=np.allclose, 
    xlabel='N Groups'
)

enter image description here

ALollz
  • 57,915
  • 7
  • 66
  • 89