I used below a bit messy but does it Faster.
Thanks to Anurag Dabas for the help.
I merge df on its own and check the overlaps then remove extra rows, do this till there is no overlaps left.
Please note that I added all "shipNo".
import pandas as pd
import numpy as np
df = pd.DataFrame([['A','2011-02-07','2011-02-22',1],['A','2011-02-14','2011-03-10',2],['A','2011-03-07','2011-03-15',3],['A','2011-03-18','2011-03-25',4]], columns = ['Cust','startDate','endDate','shipNo'])
df['startDate'] = pd.to_datetime(df['startDate'])
df['endDate'] = pd.to_datetime(df['endDate'])
def overlap_checker(data):
data['CuststartDateendDate']=data['Cust'].map(str)+data['startDate'].map(str)+data['endDate'].map(str)
df2=pd.merge(data,data,on='Cust')
df2['Overlap']=np.where((df2['startDate_x']<=df2['endDate_y'])&(df2['endDate_x']>=df2['startDate_y']) & (df2['CuststartDateendDate_x'] != df2['CuststartDateendDate_y']), 'Overlapped','not overlapped')
df2['startDate_x']=np.where(df2['Overlap'].eq('Overlapped'),df2[['startDate_x','startDate_y']].min(axis=1),df2['startDate_x'])
df2['endDate_x']=np.where(df2['Overlap'].eq('Overlapped'),df2[['endDate_x','endDate_y']].max(axis=1),df2['endDate_x'])
df2['shipNo']=df2['shipNo_x'].map(str)+df2['shipNo_y'].map(str)
df2['shipNo'] = df2['shipNo'].apply(lambda x: ' '.join(sorted(set(x))))
df2.rename(columns = {'startDate_x':'startDate','endDate_x':'endDate'}, inplace = True)
return df2, data
def overlap_remover(df, data):
df2= df[(df['Overlap']=="Overlapped")]
data1=data[~data['CuststartDateendDate'].isin(df2['CuststartDateendDate_x'])]
df2 = df2.drop(columns=['startDate_y','endDate_y','Overlap','CuststartDateendDate_x','CuststartDateendDate_y','shipNo_x','shipNo_y'])
df2 = df2.drop_duplicates()
bigdata = data1.append(df2, ignore_index=True,sort=False)
return bigdata
dftmp, data = overlap_checker(df)
while dftmp['Overlap'].str.contains('Overlapped').any():
df = overlap_remover(dftmp,data)
dftmp, data = overlap_checker(df)
df = df.drop(columns=['CuststartDateendDate'])
df = df[['Cust','startDate','endDate','shipNo']]
print(df)