This is one of the reasons why keeping iterable objects as values inside a dataframe should be avoided: most pandas functions do not play well with that kind of structure.
That said, you can get to your solution with df.explode
and df.transform,
among other tricks.
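To see what the first trick does in isolation, here is a minimal sketch with made-up data (not your dataframe): explode turns every element of a list-like cell into its own row, repeating the other columns.
# minimal sketch of explode with made-up data (not your dataframe)
import pandas as pd
toy = pd.DataFrame({'name': ['a', 'b'], 'values': [[1, 2, 3], [4, 5]]})
print(toy.explode('values'))
#   name values
# 0    a      1
# 0    a      2
# 0    a      3
# 1    b      4
# 1    b      5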
import pandas as pd
import numpy as np
# sample data
# please always provide a runnable line of code that recreates your data
# you can get it with df.head(10).to_dict('split')
# read more about this in https://stackoverflow.com/q/63163251/6692898
# and https://stackoverflow.com/q/20109391/6692898
np.random.seed(1)
df = pd.DataFrame({
'FileName': ['FileName1', 'FileName2', 'FileName3'],
'Num': [3, 5, 4],
'Onsets': [np.random.randint(1, 1000, 12) for _ in range(3)],
'Offsets': [np.random.randint(1, 1000, 12) for _ in range(3)],
'Durations': [np.random.randint(1, 1000, 12) for _ in range(3)]
})
print(df)
FileName Num Onsets \
0 FileName1 3 [38, 236, 909, 73, 768, 906, 716, 646, 848, 96...
1 FileName2 5 [973, 584, 750, 509, 391, 282, 179, 277, 255, ...
2 FileName3 4 [908, 253, 491, 669, 926, 399, 563, 581, 216, ...
Offsets \
0 [479, 865, 87, 142, 394, 8, 320, 830, 535, 314...
1 [317, 210, 265, 729, 654, 628, 432, 634, 457, ...
2 [455, 918, 562, 314, 516, 965, 793, 498, 44, 5...
Durations
0 [337, 622, 884, 298, 467, 16, 65, 197, 26, 368...
1 [904, 283, 666, 617, 23, 778, 708, 127, 280, 3...
2 [934, 314, 596, 167, 649, 289, 419, 779, 280, ...
The code
# work with a temp dataframe
df2 = df[['FileName', 'Num', 'Durations']].explode('Durations')
df2.Durations = df2.Durations.astype(int) # needed only because of how the sample was created
# should not be necessary with your dataframe
df2['tag'] = ( # add cyclic tags to each row, within each FileName
df2.groupby('FileName').Durations.transform('cumcount') # similar to range(len(group))
% df2.Num # get the modulo of the row number within the group
)
# get averages and collect into lists
df2 = df2.groupby(['FileName', 'tag']).Durations.mean() # get average
df2.rename('Durations_avgs', inplace=True)
# collect in a list by FileName and merge with original df
df = df.merge(df2.groupby('FileName').agg(list), on='FileName')
Output
FileName Num Onsets \
0 FileName1 3 [38, 236, 909, 73, 768, 906, 716, 646, 848, 96...
1 FileName2 5 [973, 584, 750, 509, 391, 282, 179, 277, 255, ...
2 FileName3 4 [908, 253, 491, 669, 926, 399, 563, 581, 216, ...
Offsets \
0 [479, 865, 87, 142, 394, 8, 320, 830, 535, 314...
1 [317, 210, 265, 729, 654, 628, 432, 634, 457, ...
2 [455, 918, 562, 314, 516, 965, 793, 498, 44, 5...
Durations \
0 [337, 622, 884, 298, 467, 16, 65, 197, 26, 368...
1 [904, 283, 666, 617, 23, 778, 708, 127, 280, 3...
2 [934, 314, 596, 167, 649, 289, 419, 779, 280, ...
Durations_avgs
0 [267.0, 506.25, 349.5]
1 [679.6666666666666, 382.3333333333333, 396.5, ...
2 [621.0, 419.6666666666667, 589.0, 344.66666666...
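Each list in Durations_avgs has exactly Num entries per row because of the cyclic tag. As a small standalone check (made-up numbers, not part of the solution), cumcount counts 0, 1, 2, ... within a group, and taking it modulo Num wraps it around:
# standalone check of the cyclic tag with made-up numbers
import pandas as pd
cumcount = pd.Series(range(6))    # what cumcount yields within one FileName
num = 3                           # that row's Num
print((cumcount % num).tolist())  # [0, 1, 2, 0, 1, 2] -> items 0&3, 1&4, 2&5 share a tag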
Update
Kshitij shows a good idea of defining a function for this (in case you want to do it for several columns). But it is good to avoid apply
when the same result can be achieved with pandas native functions.
Here is a function that does this dynamically for any column:
def get_averages(df: pd.DataFrame, column: str) -> pd.DataFrame:
    '''
    Return a copy of `df` with a `{column}_avgs` column holding
    the averages of each `Num` cyclical item of `column` for each row
    '''
    # work with a new dataframe
    df2 = (
        df[['FileName', 'Num', column]]
        .explode(column, ignore_index=True)
    )
    # needed only because of how the sample was created
    # should not be necessary with your dataframe
    df2[column] = df2[column].astype(int)
    df2['tag'] = (  # add cyclic tags to each row, within each FileName
        df2.groupby('FileName')[column]
        .transform('cumcount')  # similar to range(len(group))
        % df2.Num  # get the modulo of the row number within the group
    )
    # get averages and collect into lists
    df2 = df2.groupby(['FileName', 'tag'])[column].mean()  # get average
    df2.rename(f'{column}_avgs', inplace=True)
    # collect in a list by FileName and merge with original df
    df2 = df2.groupby('FileName').agg(list)
    return df.merge(df2, on='FileName')
df = get_averages(df, 'Durations')
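The same helper can then be reused for the remaining list columns of the sample data (a usage sketch, assuming you also want averages for Onsets and Offsets):
# reuse the helper for the other list columns (usage sketch)
for col in ('Onsets', 'Offsets'):
    df = get_averages(df, col)
print(df.columns.tolist())  # now also contains 'Onsets_avgs' and 'Offsets_avgs'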