I came up with a solution: I use groupby and then check, for each group, whether I actually want to deduplicate it or not. It seems highly inefficient, but it does the trick. If someone comes up with a better solution, I will gladly accept it.
First, we add columns counting the number of unique IDs per group, boolean columns indicating, for each row, whether each ID is present, and lastly a count of the number of IDs per row (useful for sorting the dataframe later).
df["first_id_count"] = df.groupby(["name", "city"])["my_first_id"].transform('nunique')
df["second_id_count"] = df.groupby(["name", "city"])["my_second_id"].transform('nunique')
df["my_first_id_present"] = df["my_first_id"].notna()
df["my_second_id_present"] = df["my_second_id"].notna()

# count of non-null ids per row, used later to sort each group
df["ids_count"] = df[["my_first_id", "my_second_id"]].notna().sum(axis=1)
Then, we can build the groupby object and iterate over each group, collecting the results in a list so we only concatenate once at the end.
grouped = df.groupby(["name", "city"])
df_final_parts = []
ids_to_deduplicate = ["first_id_count", "second_id_count"]
ids_present = ["my_first_id_present", "my_second_id_present"]
for name, group in grouped:
    if group["first_id_count"].iloc[0] < 2 and group["second_id_count"].iloc[0] < 2:
        # if there are strictly fewer than 2 unique values of my_first_id and of my_second_id
        # then we can safely deduplicate and add the result to the final dataframe
        df_final_parts.append(group.groupby(["name", "city"]).first().reset_index())
    else:
        # if not, we have to split the group in two:
        # one part we want to deduplicate,
        # one part we shouldn't touch
        df_duplicate = group.copy()
        # first, we sort by the number of ids per row
        df_duplicate = df_duplicate.sort_values(by=["ids_count"], ascending=False)
        # and reset the index, for ease of use
        df_duplicate = df_duplicate.reset_index(drop=True)
        # rows we want to deduplicate
        rows_to_deduplicate = []
        # rows we want to keep
        rows_to_keep = []
        # one flag per id column: True once a deduplicated row already carries that id
        flags_list = [False] * len(ids_to_deduplicate)
        for idx in df_duplicate.index:
            flag = False
            # first, check whether a flag is already True for an id column that is
            # also present on this row: we already took a row with that kind of id,
            # and this row carries another value for it, so we keep the row as-is
            # to avoid losing any information
            for idx_id, my_id in enumerate(ids_to_deduplicate):
                if flags_list[idx_id] and df_duplicate.loc[idx, ids_present[idx_id]]:
                    # we add it to the rows to keep
                    rows_to_keep.append(idx)
                    flag = True
                    break
            if flag:
                continue
            # at this point we want to deduplicate this row, otherwise it would
            # have been flagged above
            deduplicated = False
            for idx_id, my_id in enumerate(ids_to_deduplicate):
                if not flags_list[idx_id] and df_duplicate.loc[idx, ids_present[idx_id]]:
                    # we add it to the rows to deduplicate
                    rows_to_deduplicate.append(idx)
                    deduplicated = True
                    # set the flags for every id present on this row
                    # (a row can carry several ids, hence the inner loop)
                    for idx_id_temp, id_to_check in enumerate(ids_to_deduplicate):
                        if df_duplicate.loc[idx, ids_present[idx_id_temp]]:
                            flags_list[idx_id_temp] = True
                    break
            if not deduplicated:
                # the row carries no id at all: send it with the rows to
                # deduplicate so it gets merged instead of silently dropped
                rows_to_deduplicate.append(idx)
        # now we have our 2 separate dataframes
        df_duplicate_keep = df_duplicate.loc[rows_to_keep].copy()
        df_duplicate_deduplicate = df_duplicate.loc[rows_to_deduplicate].copy()
        # we keep one part as-is, deduplicate the other, and concatenate the two
        df_final_duplicate = pd.concat([df_duplicate_keep, df_duplicate_deduplicate.groupby(["name", "city"]).first().reset_index()])
        # and add the result to our final dataframe
        df_final_parts.append(df_final_duplicate)

df_final = pd.concat(df_final_parts, ignore_index=True)
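To make the flag logic concrete, here is a trace on a hypothetical group with a conflict on my_first_id (the values are made up):

# group rows, already sorted by ids_count in descending order:
#   row 0: my_first_id=A1,  my_second_id=B1   (2 ids)
#   row 1: my_first_id=A2,  my_second_id=NaN  (1 id)
#   row 2: my_first_id=NaN, my_second_id=B1   (1 id)
#
# row 0: no flag is set yet          -> rows_to_deduplicate, flags_list becomes [True, True]
# row 1: first-id flag already True,
#        and my_first_id is present  -> rows_to_keep (merging would lose A2)
# row 2: second-id flag already True,
#        and my_second_id is present -> rows_to_keep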
And to clean up our mess:
df_final = df_final.drop("ids_count", axis=1)
for col in ids_to_deduplicate:
df_final = df_final.drop(col, axis=1)
for col in ids_present:
df_final = df_final.drop(col, axis=1)
And we have the desired output.
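For reference, here is a self-contained toy run (the dataframe and its values are made up) showing the end-to-end behavior:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "name":         ["John", "John", "Jane", "Jane"],
    "city":         ["Paris", "Paris", "Lyon", "Lyon"],
    "my_first_id":  ["A1", "A2", "A3", np.nan],
    "my_second_id": ["B1", np.nan, np.nan, "B2"],
})
# running all the steps above on this df should give (row order may vary
# with how the pieces are concatenated):
#    name   city my_first_id my_second_id
# 0  Jane   Lyon          A3           B2
# 1  John  Paris          A2          NaN
# 2  John  Paris          A1           B1
#
# (Jane, Lyon) has no conflicting ids, so the group collapses to one row,
# with first() picking the non-null values. (John, Paris) has two unique
# my_first_id values, so the row carrying A2 is kept instead of being merged away.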
Again, this seems really ugly, so if anyone has a better solution, feel free to share.