I have a datetime index pandas with hourly data that needs to be interpolated when nans
exist. Sometimes 1 hour is missing and a linear interpolation would be enough, but sometimes it can be days, in which case I'd need it to consider the behavior it has had on the last week on average to fill the values. Any ideas on how to do this?
Right now I am using df.interpolate(method="linear")
but I need to fix it so that it considers the average of the last seven days (with data) when there is a large gap (more than 3 consecutive hours). So for example, if 2017-07-02 04:00:00
is missing, and is a part of a large gap, it should fill the value with the average value that occurs everyday at 4 am between the days of 2017-06-25
and "2017-07-02".
This is an example dataset
D31 D32
time
2017-07-01 00:00:00 118.0 118.0
2017-07-01 01:00:00 126.0 126.0
2017-07-01 02:00:00 96.0 np.nan
2017-07-01 03:00:00 88.0 88.0
2017-07-01 04:00:00 76.0 76.0
2017-07-01 05:00:00 60.0 60.0
2017-07-01 06:00:00 59.0 59.0 2017-07-01 07:00:00 53.0 53.0
2017-07-01 08:00:00 54.0 54.0
2017-07-01 09:00:00 47.0 47.0
2017-07-01 10:00:00 48.0 48.0
2017-07-01 11:00:00 56.0 56.0
2017-07-01 12:00:00 65.0 65.0
2017-07-01 13:00:00 57.0 57.0
2017-07-01 14:00:00 46.0 46.0
2017-07-01 15:00:00 39.0 39.0
2017-07-01 16:00:00 24.0 24.0
2017-07-01 17:00:00 22.0 22.0
2017-07-01 18:00:00 np.nan 28.0
2017-07-01 19:00:00 np.nan 25.0
2017-07-01 20:00:00 38.0 38.0
2017-07-01 21:00:00 52.0 52.0
2017-07-01 22:00:00 123.0 123.0
2017-07-01 23:00:00 np.nan np.nan
2017-07-02 00:00:00 np.nan np.nan
2017-07-02 01:00:00 np.nan np.nan
2017-07-02 02:00:00 np.nan np.nan
2017-07-02 03:00:00 np.nan np.nan
2017-07-02 04:00:00 np.nan np.nan
2017-07-02 05:00:00 np.nan np.nan
2017-07-02 06:00:00 np.nan np.nan 2017-07-02 07:00:00 np.nan np.nan
2017-07-02 08:00:00 np.nan np.nan
2017-07-02 09:00:00 np.nan np.nan
2017-07-02 10:00:00 np.nan np.nan
2017-07-02 11:00:00 np.nan np.nan
2017-07-02 12:00:00 np.nan np.nan
2017-07-02 13:00:00 np.nan np.nan
2017-07-02 14:00:00 np.nan np.nan
2017-07-02 15:00:00 np.nan np.nan
2017-07-02 16:00:00 np.nan np.nan
2017-07-02 17:00:00 np.nan np.nan
2017-07-02 18:00:00 np.nan 28.0
2017-07-02 19:00:00 np.nan 25.0
2017-07-02 20:00:00 38.0 38.0
2017-07-02 21:00:00 52.0 52.0
2017-07-02 22:00:00 123.0 123.0
2017-07-02 23:00:00 130.0 131.0
2017-07-03 00:00:00 115.0 118.0
2017-07-03 01:00:00 126.0 128.0
2017-07-03 02:00:00 96.0 np.nan
2017-07-03 03:00:00 86.0 88.0
2017-07-03 04:00:00 77.0 75.0
2017-07-03 05:00:00 60.0 60.0
2017-07-03 06:00:00 61.0 59.0 2017-07-03 07:00:00 57.0 53.0
2017-07-03 08:00:00 55.0 52.0
2017-07-03 09:00:00 47.0 48.0
2017-07-03 10:00:00 42.0 43.0
2017-07-03 11:00:00 56.0 57.0
2017-07-03 12:00:00 68.0 62.0
2017-07-03 13:00:00 56.0 57.0
2017-07-03 14:00:00 47.0 42.0
2017-07-03 15:00:00 33.0 37.0
2017-07-03 16:00:00 27.0 25.0
2017-07-03 17:00:00 24.0 20.0
2017-07-03 18:00:00 np.nan 28.0
2017-07-03 19:00:00 42.0 42.0
2017-07-03 20:00:00 42.0 42.0
2017-07-03 21:00:00 33.0 33.0
2017-07-03 22:00:00 35.0 35.0
2017-07-03 23:00:00 59.0 59.0
Thanks!
Edit:
Based on W-M response I was able to do the following code which works for what I want.
def interpolate_obs(df):
def long_nan_series(series):
# select this series when all values are NaNs
all_nans = series.isnull().all()
# and the delta is too long for interpolation
# note: taking the last value minus the first,
# so this is the delta between the last NaN
# value and the first NaN value - there's
# an hour duration more until the next non-null value
too_long = series.index[-1] - series.index[0] > pd.Timedelta("3 hours")
return too_long & all_nans
def get_average_value(series, mean_value, date):
result=np.nan
days=0 #(3, 6, 9, 12, 15, 18, 21, 24, 27, 30)
days_mean=-1 #(1, 3, 5, 7, 9, 11, 13, 15, 17, 19)
while result!=result:
days+=3
## If nothing is found within a month (15 days before, 15 days after) then stop
if days>15:
return np.nan
## Get last week
timedelta=pd.Timedelta("%s days"%days)
working_data=series.loc[date-timedelta:date+timedelta]
## Get only the ones that are the same hour
working_data=working_data[working_data.index.hour==date.hour]
result=working_data.mean()
if result==result:
return result
## Get surrounding three days
days_mean+=2
timedelta=pd.Timedelta("%s days"%days_mean)
working_data=mean_value.loc[date-timedelta:date+timedelta]
## Get only the ones that are the same hour
working_data=working_data[workinreg_data.index.hour==date.hour]
result=working_data.mean()
if result==result:
return result
mean_value=df.mean(axis=1)
for col in df.columns:
series=df[col]
df_nan_group_keys = series.isnull().diff().ne(0).cumsum()
series_long_nans = series.groupby(df_nan_group_keys).transform(long_nan_series)
## Small gaps
series_small_gaps=series[~series_long_nans]
series_small_interp = series_small_gaps.interpolate(method="linear")
## Long gaps. Groups them by gaps
series_long_gaps=series[series_long_nans]
time_dif=series_long_gaps.index.to_series().diff()
time_dif[time_dif>pd.Timedelta("1H")]=np.nan
time_dif=time_dif.replace(pd.Timedelta("1H"), 0).replace(np.nan, 1)
time_dif=time_dif.astype(int).cumsum()
## Retrieve each gap and find the new values
both=pd.concat([series_long_gaps, time_dif], axis=1)
both.columns=[col, "group"]
series_long_interp=[]
for group, df_group in both.groupby("group"):
series_long_interp.append(df_group.apply(lambda x: get_average_value(series, mean_value, x.name), axis=1))
series_long_interp=pd.concat(series_long_interp)
df[col]=pd.concat([series_small_interp, series_long_interp]).sort_index()
return df