Indeed not very straight forward. I would go over dataframe 1 row by row, and find a suitable match (based on the criteria) in dataframe 2. If there are multiple candidates you'll have to decide how to pick the best one (maybe just the first match), and then combine the match with the original row.
import random
import pandas
import datetime
# Generate some test dataframes
df_1 = pandas.DataFrame([{
"termination_date": datetime.datetime.fromtimestamp(random.randint(1605865919, 1606297560)),
"creation_date": datetime.datetime.fromtimestamp(random.randint(1605865919, 1606297560)),
"smth1": random.randint(0, 1),
"smth2": random.randint(0, 1),
} for _ in range(1000)])
df_2 = pandas.DataFrame([{
"termination_date": datetime.datetime.fromtimestamp(random.randint(1605865919, 1606297560)),
"creation_date": datetime.datetime.fromtimestamp(random.randint(1605865919, 1606297560)),
"smth1": random.randint(0, 1),
"smth2": random.randint(0, 1),
} for _ in range(1000)])
def try_to_join(row: pandas.Series, other: pandas.DataFrame, suffix: str = "_right") -> pandas.Series:
""" Try to join based on complex conditions.
Args:
row (pandas.Series): The row to join (left)
other (pandas.DataFrame): The dataframe to join with (right)
suffix (str, optional): A suffix that will be put on the columns of the "other". Defaults to "_right".
Returns:
pandas.Series: The joined series
"""
# Define the conditions
condition_1 = row["termination_date"] - other["creation_date"] < pandas.Timedelta(24, unit="H")
condition_2 = row["termination_date"] - other["creation_date"] > pandas.Timedelta(-24, unit="H")
condition_3 = row["smth1"] == other["smth1"]
condition_4 = row["smth2"] != other["smth2"]
# Find the rows in "other" that fullfill the conditions
matches = other.loc[condition_1 & condition_2 & condition_3 & condition_4, :]
# Return the original row, if no match was found
if matches.shape[0] == 0:
return row
# TODO: Decide how to handle multiple matches (pick the first match for now)
match = matches.iloc[0]
# Add a suffix to the "other" columns
match.index = [f"{col}{suffix}" for col in match.index]
# Add the fields from the match to the original row
for col in match.index:
row[col] = match[col]
# Return the new row
return row
# Use the function above to join the 2 dataframes into a new dataframe
df_joined = df_1.apply(lambda row: try_to_join(row, other=df_2), axis=1)
Output:
creation_date creation_date_right smth1 smth1_right smth2 \
0 2020-11-24 02:10:35 2020-11-24 07:08:40 1 1.0 0
1 2020-11-21 23:46:28 NaT 0 NaN 1
2 2020-11-22 21:54:58 2020-11-22 13:27:57 0 0.0 0
3 2020-11-23 18:45:59 2020-11-20 22:58:10 1 1.0 0
4 2020-11-20 13:24:18 2020-11-23 06:18:23 1 1.0 0
.. ... ... ... ... ...
995 2020-11-25 10:40:19 2020-11-24 07:08:40 1 1.0 0
996 2020-11-24 19:27:47 2020-11-24 23:21:32 0 0.0 0
997 2020-11-23 10:18:30 2020-11-24 07:08:40 1 1.0 0
998 2020-11-20 15:54:53 2020-11-24 23:21:32 0 0.0 0
999 2020-11-22 14:19:45 2020-11-24 23:21:32 0 0.0 0
smth2_right termination_date termination_date_right
0 1.0 2020-11-25 01:59:37 2020-11-22 03:38:51
1 NaN 2020-11-23 22:55:26 NaT
2 1.0 2020-11-22 12:47:03 2020-11-24 12:32:50
3 1.0 2020-11-21 18:49:48 2020-11-22 16:16:23
4 1.0 2020-11-22 12:10:14 2020-11-24 00:58:23
.. ... ... ...
995 1.0 2020-11-23 08:04:51 2020-11-22 03:38:51
996 1.0 2020-11-24 09:48:11 2020-11-23 13:43:11
997 1.0 2020-11-24 06:45:34 2020-11-22 03:38:51
998 1.0 2020-11-25 02:19:43 2020-11-23 13:43:11
999 1.0 2020-11-25 09:41:06 2020-11-23 13:43:11
[1000 rows x 8 columns]
You'll notice that when no match is found, the fields for _right
will be empty.