Based on the output dataframe produced by the following code:
# Reproducible sample data: 90 daily rows of uniform noise in [0, 10).
import pandas as pd
import numpy as np
np.random.seed(2021)  # fix the RNG so the example is reproducible
dates = pd.date_range('20130226', periods=90)
# Five candidate model columns ('*_values') plus the observed 'target'.
df = pd.DataFrame(np.random.uniform(0, 10, size=(90, 6)), index=dates, columns=['A_values', 'B_values', 'C_values', 'D_values', 'E_values', 'target'])
# Labels of the model columns (everything ending in '_values').
models = df.columns[df.columns.str.endswith('_values')]
def mape(y_true, y_pred):
    """Column-wise mean absolute percentage error (in %).

    The denominator is floored at 1 so near-zero true values do not
    blow the percentage up; averaging is over axis 0 (rows).
    """
    pred = np.array(y_pred)
    abs_err = np.abs(y_true - pred)
    # max(|y_true|, 1) is equivalent to clip(|y_true|, 1, inf)
    floored = np.maximum(np.abs(y_true), 1)
    return np.mean(abs_err / floored, axis=0) * 100
# Per-calendar-month MAPE of each model column against 'target'.
errors = (df.groupby(pd.Grouper(freq='M'))
.apply(lambda x: mape(x[models], x[['target']]))
)
k = 2            # how many best (lowest-error) models to keep per month
n = len(models)
# NOTE(review): np.argsort returns sort *positions*, not ranks, so
# `argsort(errors) < k` does not generally flag the k smallest columns;
# a rank (argsort of argsort) or nsmallest would — verify before reuse.
sorted_args = np.argsort(errors, axis=1) < k
# Broadcast the month-end boolean flags back onto the daily index;
# direction='forward' matches each day to the next (its own) month-end label.
res = pd.merge_asof(df[['target']], sorted_args,
left_index=True,
right_index=True,
direction='forward'
)
# Keep model values only where flagged; everything else becomes NaN.
topk = df[models].where(res[models])
df = df.join(topk.add_suffix('_mape'))
df = df[['target', 'A_values_mape', 'B_values_mape', 'C_values_mape', 'D_values_mape',
'E_values_mape']]
df
Out:
target A_values_mape ... D_values_mape E_values_mape
2013-02-26 1.281624 6.059783 ... 3.126731 NaN
2013-02-27 0.585713 1.789931 ... 7.843101 NaN
2013-02-28 9.638430 9.623960 ... 5.612724 NaN
2013-03-01 1.950960 NaN ... NaN 5.693051
2013-03-02 0.690563 NaN ... NaN 7.322250
... ... ... ... ...
2013-05-22 5.554824 NaN ... NaN 6.803052
2013-05-23 8.440801 NaN ... NaN 2.756443
2013-05-24 0.968086 NaN ... NaN 0.430184
2013-05-25 0.672555 NaN ... NaN 5.461017
2013-05-26 5.273122 NaN ... NaN 6.312104
How could I group the data by year-month, drop the columns that are all NaNs within each group, and then rename the remaining columns to top_1, top_2, ..., top_k?
The final expected result could look like this if k=2:
Pseudocode:
df2 = df.filter(regex='_mape$').groupby(pd.Grouper(freq='M')).dropna(axis=1, how='all')
df2.columns = ['top_1', 'top_2', ..., 'top_k']
df.join(df2)
As @Quang Hoang commented in the last post, we might be able to use justify_nd to achieve that, but I don't know how. Thanks for your help in advance.
EDIT:
# Rebuild the sample frame for the EDIT section.
# NOTE(review): the RNG is not re-seeded here, so these values continue the
# stream from the earlier seed(2021) call — TODO confirm that is intended.
dates = pd.date_range('20130226', periods=90)
df = pd.DataFrame(np.random.uniform(0, 10, size=(90, 6)), index=dates, columns=['A_values', 'B_values', 'C_values', 'D_values', 'E_values', 'target'])
models = df.columns[df.columns.str.endswith('_values')]
k = 2            # number of best models to keep per month
n = len(models)
def grpProc(grp):
    """Return grp's k lowest-MAPE model columns, renamed top_1..top_k.

    Uses the module-level `mape`, `models` and `k`.
    """
    month_err = mape(grp[models], grp[['target']])
    best = month_err.nsmallest(k).index
    ranked = grp.loc[:, best]
    ranked.columns = [f'top_{rank}' for rank in range(1, k + 1)]
    return ranked
# Apply per month; each group returns a frame indexed like the group,
# so joining on the daily index aligns the top-k columns back onto df.
wrk = df.groupby(pd.Grouper(freq='M')).apply(grpProc)
res = df[['target']].join(wrk)
print(res)
Out:
target top_1 top_2
2013-02-26 1.281624 6.059783 9.972433
2013-02-27 0.585713 1.789931 0.968944
2013-02-28 9.638430 9.623960 6.165247
2013-03-01 1.950960 4.521452 5.693051
2013-03-02 0.690563 5.178144 7.322250
... ... ...
2013-05-22 5.554824 3.864723 6.803052
2013-05-23 8.440801 5.140268 2.756443
2013-05-24 0.968086 5.890717 0.430184
2013-05-25 0.672555 1.610210 5.461017
2013-05-26 5.273122 6.893207 6.312104