First convert values to datetimes by to_datetime
:
df1['DateTime'] = pd.to_datetime(df1['DateTime'], format='%Y-%m-%d %H:%M:%f')
df2.index = pd.to_datetime(df2.index)
Then reshape df2
by DataFrame.stack
and DataFrame.reset_index
:
df22 = df2.stack().rename_axis(('dates','Shift')).reset_index(name='value')
print (df22)
dates Shift value
0 2023-01-01 A M
1 2023-01-01 B N
2 2023-01-01 C -
3 2023-01-02 A M
4 2023-01-02 B N
5 2023-01-02 C -
6 2023-01-03 A N
7 2023-01-03 B -
8 2023-01-03 C M
9 2023-01-04 A N
10 2023-01-04 B -
11 2023-01-04 C M
Create helper column by Series.dt.floor
and numpy.where
, so possible use left join with df22
:
df = (df1.assign(dates = df1['DateTime'].dt.floor('D'),
value = np.where(df1['DateTime'].dt.hour.le(12), 'M','N'))
.merge(df22, how='left')
.drop(['dates','value'], axis=1))
print (df)
DateTime State Comment Shift
0 2023-01-01 09:01:00.300 running NaN A
1 2023-01-01 10:10:00.230 stopped Finished A
2 2023-01-01 22:50:00.670 running NaN B
3 2023-01-02 07:58:00.990 stopped Finished A
4 2023-01-02 15:06:00.800 running NaN B
5 2023-01-02 20:23:00.550 stopped User aborted B
6 2023-01-03 07:02:00.650 running NaN C
7 2023-01-03 10:07:00.130 stopped Finished C
If both DatimeIndex
:
df1.index = pd.to_datetime(df1.index, format='%Y-%m-%d %H:%M:%f')
df2.index = pd.to_datetime(df2.index)
df22 = df2.stack().rename_axis(('dates','Shift')).reset_index(name='value')
df = (df1.reset_index()
.assign(dates = df1.index.floor('D'),
value = np.where(df1.index.hour < 12, 'M','N'))
.merge(df22, how='left')
.drop(['dates','value'], axis=1))
print (df)
DateTime State Comment Shift
0 2023-01-01 09:01:00.300 running NaN A
1 2023-01-01 10:10:00.230 stopped Finished A
2 2023-01-01 22:50:00.670 running NaN B
3 2023-01-02 07:58:00.990 stopped Finished A
4 2023-01-02 15:06:00.800 running NaN B
5 2023-01-02 20:23:00.550 stopped User aborted B
6 2023-01-03 07:02:00.650 running NaN C
7 2023-01-03 10:07:00.130 stopped Finished C
Testing performance in 1k rows:
np.random.seed(2023)
N = 1000
df1 = pd.DataFrame({'State': np.random.choice(['running','stopped'], size=N)},
index=pd.date_range('2000-01-01', freq='22H', periods=N))
#print (df1)
#https://stackoverflow.com/a/55317373/2901002
def shuffle_along_axis(a, axis):
idx = np.random.rand(*a.shape).argsort(axis=axis)
return np.take_along_axis(a,idx,axis=axis)
df2 = pd.DataFrame(shuffle_along_axis(np.tile(['M','N','-'], N).reshape(N, -1), axis=1),
columns=list('ABC'),index=pd.date_range('2000-01-01', freq='D', periods=N))
#print (df2)
def get_shift(timeindex):
daytime = "M" if timeindex.hour < 12 else "N"
timeindex_date = timeindex.normalize()
shifts = df2.loc[timeindex_date].where(df2.loc[timeindex_date] == daytime)
return shifts.dropna().index.item()
In [133]: %timeit df1["Shift"] = df1.index.map(get_shift)
889 ms ± 5.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [134]: %%timeit
...: (df1.reset_index()
...: .assign(dates = df1.index.floor('D'),
...: value = np.where(df1.index.hour < 12, 'M','N'))
...: .merge(df2.stack().rename_axis(('dates','Shift'))
.reset_index(name='value'), how='left')
...: .drop(['dates','value'], axis=1))
...:
9.86 ms ± 54.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Testing in 100k rows:
np.random.seed(2023)
N = 100000
df1 = pd.DataFrame({'State': np.random.choice(['running','stopped'], size=N)},
index=pd.date_range('1900-01-01', freq='22H', periods=N))
#print (df1)
#https://stackoverflow.com/a/55317373/2901002
def shuffle_along_axis(a, axis):
idx = np.random.rand(*a.shape).argsort(axis=axis)
return np.take_along_axis(a,idx,axis=axis)
df2 = pd.DataFrame(shuffle_along_axis(np.tile(['M','N','-'], N).reshape(N, -1), axis=1),
columns=list('ABC'),index=pd.date_range('1900-01-01', freq='D', periods=N))
#print (df2)
def get_shift(timeindex):
daytime = "M" if timeindex.hour < 12 else "N"
timeindex_date = timeindex.normalize()
shifts = df2.loc[timeindex_date].where(df2.loc[timeindex_date] == daytime)
return shifts.dropna().index.item()
In [2]: %timeit df1["Shift"] = df1.index.map(get_shift)
1min 35s ± 3.85 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [3]: %%timeit
...: (df1.reset_index()
...: .assign(dates = df1.index.floor('D'),
...: value = np.where(df1.index.hour < 12, 'M','N'))
...: .merge(df2.stack().rename_axis(('dates','Shift'))
...: .reset_index(name='value'), how='left')
...: .drop(['dates','value'], axis=1))
...:
195 ms ± 7.03 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)