Here's a variation based on np.searchsorted that seems to be an order of magnitude faster than using intervaltree or merge, assuming my larger sample data is correct.
# Ensure the df2 is sorted (skip if it's already known to be).
df2 = df2.sort_values(by=['start_date', 'end_date'])
# Add the end of the time interval to df1.
df1['date_end'] = df1['date'] + pd.DateOffset(minutes=9, seconds=59)
# Perform the searchsorted and get the corresponding df2 values for both endpoints of df1.
s1 = df2.reindex(np.searchsorted(df2['start_date'], df1['date'], side='right')-1)
s2 = df2.reindex(np.searchsorted(df2['start_date'], df1['date_end'], side='right')-1)
# Build the conditions that indicate an overlap (any True condition indicates an overlap).
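# Condition 1: df1's start lies inside the last df2 interval starting at or before it.
# Condition 2: df1's end lies inside the last df2 interval starting at or before it.
# Condition 3: the two lookups hit different df2 rows, i.e. a df2 interval starts within the df1 span.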
cond = [
    df1['date'].values <= s1['end_date'].values,
    df1['date_end'].values <= s2['end_date'].values,
    s1.index.values != s2.index.values
]
# Filter df1 to only the overlapping intervals, and drop the extra 'date_end' column.
df1 = df1[np.any(cond, axis=0)].drop('date_end', axis=1)
This may need to be modified if the intervals in df2 are nested or overlapping; I haven't fully thought it through in that scenario, but it may still work.
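If you do run into that case, a slow but unambiguous brute-force check is an easy way to verify the result on a small sample. The sketch below is just that, a verification helper I'm adding for illustration (the brute_force name and the 9m 59s span mirror the code above); it isn't part of the timed solutions.
import pandas as pd

def brute_force(df1, df2):
    # End of each 10-minute span from df1, mirroring 'date_end' above.
    date_end = df1['date'] + pd.DateOffset(minutes=9, seconds=59)
    # Closed intervals [s, e] and [start, end] overlap iff start <= e and s <= end.
    mask = [
        ((df2['start_date'] <= e) & (s <= df2['end_date'])).any()
        for s, e in zip(df1['date'], date_end)
    ]
    return df1[mask]
Since it checks every pair directly, it stays correct even when the df2 intervals are nested or overlap each other, which makes it a handy reference to compare the searchsorted output against.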
Using an Interval Tree
Not quite a pure Pandas solution, but you may want to consider building an Interval Tree from df2, and querying it against your intervals in df1 to find the ones that overlap. The intervaltree package on PyPI seems to have good performance and easy-to-use syntax.
from intervaltree import IntervalTree
# Build the Interval Tree from df2.
tree = IntervalTree.from_tuples(df2.astype('int64').values + [0, 1])
# Build the 10-minute spans from df1.
dt_pairs = pd.concat([df1['date'], df1['date'] + pd.offsets.Minute(10)], axis=1)
# Query the Interval Tree to filter df1.
df1 = df1[[tree.overlaps(*p) for p in dt_pairs.astype('int64').values]]
I converted the dates to their integer equivalents for performance reasons. I doubt the intervaltree package was built with pd.Timestamp in mind, so there are probably some intermediate conversion steps that slow things down a bit.
Also, note that intervals in the intervaltree package do not include the end point, although the start point is included. That's why I have the + [0, 1] when creating tree; I'm padding the end point by a nanosecond to make sure the real end point is actually included. It's also the reason why it's fine for me to add pd.offsets.Minute(10) to get the interval end when querying the tree, instead of adding only 9m 59s.
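To make that open end point concrete, here's a tiny standalone illustration (the numbers are arbitrary and have nothing to do with the data above):
from intervaltree import IntervalTree

# Intervals are half-open: the start point is included, the end point is not.
tree = IntervalTree.from_tuples([(0, 10)])
print(tree.overlaps(5, 15))    # True:  [5, 15) intersects [0, 10)
print(tree.overlaps(10, 20))   # False: the end point 10 is excluded from [0, 10)
That exclusive end is exactly why the tree is built with the extra nanosecond and queried with the full 10-minute span above.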
The resulting output for either method:
                  date        value
0  2016-11-24 00:00:00  1759.199951
1  2016-11-24 00:10:00   992.400024
6  2016-11-24 01:00:00    82.499999
7  2016-11-24 01:10:00    37.400003
8  2016-11-24 01:20:00   159.899994
Timings
Using the following setup to produce larger sample data:
# Sample df1.
n1 = 55000
df1 = pd.DataFrame({'date': pd.date_range('2016-11-24', freq='10T', periods=n1), 'value': np.random.random(n1)})
# Sample df2.
n2 = 500
df2 = pd.DataFrame({'start_date': pd.date_range('2016-11-24', freq='18H22T', periods=n2)})
# Randomly shift the start and end dates of the df2 intervals.
shift_start = pd.Series(np.random.randint(30, size=n2)).cumsum().apply(lambda s: pd.DateOffset(seconds=s))
shift_end1 = pd.Series(np.random.randint(30, size=n2)).apply(lambda s: pd.DateOffset(seconds=s))
shift_end2 = pd.Series(np.random.randint(5, 45, size=n2)).apply(lambda m: pd.DateOffset(minutes=m))
df2['start_date'] += shift_start
df2['end_date'] = df2['start_date'] + shift_end1 + shift_end2
Which yields the following for df1 and df2:
df1
                     date     value
0     2016-11-24 00:00:00  0.444939
1     2016-11-24 00:10:00  0.407554
2     2016-11-24 00:20:00  0.460148
3     2016-11-24 00:30:00  0.465239
4     2016-11-24 00:40:00  0.462691
...
54995 2017-12-10 21:50:00  0.754123
54996 2017-12-10 22:00:00  0.401820
54997 2017-12-10 22:10:00  0.146284
54998 2017-12-10 22:20:00  0.394759
54999 2017-12-10 22:30:00  0.907233
df2
              start_date            end_date
0    2016-11-24 00:00:19 2016-11-24 00:41:24
1    2016-11-24 18:22:44 2016-11-24 18:36:44
2    2016-11-25 12:44:44 2016-11-25 13:03:13
3    2016-11-26 07:07:05 2016-11-26 07:49:29
4    2016-11-27 01:29:31 2016-11-27 01:34:32
...
495  2017-12-07 21:36:04 2017-12-07 22:14:29
496  2017-12-08 15:58:14 2017-12-08 16:10:35
497  2017-12-09 10:20:21 2017-12-09 10:26:40
498  2017-12-10 04:42:41 2017-12-10 05:22:47
499  2017-12-10 23:04:42 2017-12-10 23:44:53
And using the following functions for timing purposes:
def root_searchsorted(df1, df2):
    # Add the end of the time interval to df1.
    df1['date_end'] = df1['date'] + pd.DateOffset(minutes=9, seconds=59)
    # Get the insertion indexes for the endpoints of the intervals from df1.
    s1 = df2.reindex(np.searchsorted(df2['start_date'], df1['date'], side='right')-1)
    s2 = df2.reindex(np.searchsorted(df2['start_date'], df1['date_end'], side='right')-1)
    # Build the conditions that indicate an overlap (any True condition indicates an overlap).
    cond = [
        df1['date'].values <= s1['end_date'].values,
        df1['date_end'].values <= s2['end_date'].values,
        s1.index.values != s2.index.values
    ]
    # Filter df1 to only the overlapping intervals, and drop the extra 'date_end' column.
    return df1[np.any(cond, axis=0)].drop('date_end', axis=1)
def root_intervaltree(df1, df2):
    # Build the Interval Tree.
    tree = IntervalTree.from_tuples(df2.astype('int64').values + [0, 1])
    # Build the 10-minute spans from df1.
    dt_pairs = pd.concat([df1['date'], df1['date'] + pd.offsets.Minute(10)], axis=1)
    # Query the Interval Tree to filter the DataFrame.
    return df1[[tree.overlaps(*p) for p in dt_pairs.astype('int64').values]]
def ptrj(df1, df2):
    # The smallest amount of time - handy when using open intervals:
    epsilon = pd.Timedelta(1, 'ns')
    # Lookup series (`asof` works best with series) for `start_date` and `end_date` from `df2`:
    sdate = pd.Series(data=range(df2.shape[0]), index=df2.start_date)
    edate = pd.Series(data=range(df2.shape[0]), index=df2.end_date + epsilon)
    # (filling NaN's with -1)
    l = edate.asof(df1.date).fillna(-1)
    r = sdate.asof(df1.date + (pd.Timedelta(10, 'm') - epsilon)).fillna(-1)
    # (taking `values` here to skip indexes, which are different)
    mask = l.values < r.values
    return df1[mask]
def parfait(df1, df2):
    df1['key'] = 1
    df2['key'] = 1
    df2['row'] = df2.index.values
    # CROSS JOIN
    df3 = pd.merge(df1, df2, on=['key'])
    # DF FILTERING
    return df3[df3['start_date'].between(df3['date'], df3['date'] + dt.timedelta(minutes=9, seconds=59), inclusive=True)
               | df3['date'].between(df3['start_date'], df3['end_date'], inclusive=True)].set_index('date')[['value', 'row']]
def root_searchsorted_modified(df1, df2):
    # Add the end of the time interval to df1.
    df1['date_end'] = df1['date'] + pd.DateOffset(minutes=9, seconds=59)
    # Get the insertion indexes for the endpoints of the intervals from df1.
    s1 = df2.reindex(np.searchsorted(df2['start_date'], df1['date'], side='right')-1)
    s2 = df2.reindex(np.searchsorted(df2['start_date'], df1['date_end'], side='right')-1)
    # ---- the MODIFIED code starts here ----
    # Filter df1 to only overlapping intervals.
    df1.query('(date <= @s1.end_date.values) |\
               (date_end <= @s1.end_date.values) |\
               (@s1.index.values != @s2.index.values)', inplace=True)
    # Drop the extra 'date_end' column.
    return df1.drop('date_end', axis=1)
I get the following timings:
%timeit root_searchsorted(df1.copy(), df2.copy())
100 loops, best of 3: 9.55 ms per loop
%timeit root_searchsorted_modified(df1.copy(), df2.copy())
100 loops, best of 3: 13.5 ms per loop
%timeit ptrj(df1.copy(), df2.copy())
100 loops, best of 3: 18.5 ms per loop
%timeit root_intervaltree(df1.copy(), df2.copy())
1 loop, best of 3: 4.02 s per loop
%timeit parfait(df1.copy(), df2.copy())
1 loop, best of 3: 8.96 s per loop