Example code based on this similarity metric:
import pandas as pd
from difflib import SequenceMatcher
import numpy as np
import re
def similar(a, b) -> float:
    """Return the difflib similarity ratio between *a* and *b*.

    The score comes from ``SequenceMatcher(None, a, b).ratio()``, i.e. no
    "junk" filter is applied. It lies in ``[0.0, 1.0]``, where ``1.0``
    means the two sequences are identical.
    """
    return SequenceMatcher(None, a, b).ratio()
def remove_prefix(s: str) -> str:
    """Return the last piece of *s* after splitting on '.', ' ', '_' or '-'.

    For example ``'Mr.Reven'`` becomes ``'Reven'`` and ``'Miss. Slout'``
    becomes ``'Slout'``. A string containing none of the separators is
    returned unchanged.
    """
    # Raw-string character class: same separators as '\.| |_|-', but without
    # the invalid "\." escape that a non-raw string literal triggers.
    return re.split(r'[. _-]', s)[-1]
# Mimic dataframe
d = {
    'Names1': ['Mr.Reven', 'Freddie', 'Miss.Grey', 'James', 'Neoveeen', 'Boult', 'Dr.Alen', 'Alsray'],
    'Names2': ['Alex', 'Keven', 'Moeen', 'Shayne', 'Frey', 'mcKay', 'Adames', 'Miss. Slout'],
}
df = pd.DataFrame(d)

# Get the two name arrays with prefixes such as "Mr." / "Miss." removed
remove_prefix_fv = np.vectorize(remove_prefix)
names1 = remove_prefix_fv(df['Names1'].to_numpy())
names2 = remove_prefix_fv(df['Names2'].to_numpy())

# Get similarity scores for every (Names1, Names2) pair: names1 is turned
# into a column so broadcasting yields a len(names1) x len(names2) matrix
similar_fv = np.vectorize(similar)
scores = similar_fv(names1[:, np.newaxis], names2)

# Keep only the pairs whose score reaches the threshold;
# ind[0] holds the Names1 indices, ind[1] the matching Names2 indices
threshold = 0.7
ind = np.where(scores >= threshold)

# Cluster the Names2 elements that matched the same Names1 element.
# Each cluster starts with the Names1 entry, followed by its matches.
uc = np.unique(ind[0])
cd = {
    "Cluster-" + str(i): [names1[row]] + list(names2[ind[1][ind[0] == row]])
    for i, row in enumerate(uc)
}

# Build the dataframe. Clusters can have different sizes, and a dict of
# plain lists of unequal length makes pd.DataFrame raise ValueError, so each
# column is wrapped in a Series and shorter clusters are padded with NaN.
cdf = pd.DataFrame({label: pd.Series(members) for label, members in cd.items()})
print(cdf)
Outputs:
Cluster-0 Cluster-1 Cluster-2 Cluster-3
0 Reven Grey James Alen
1 Keven Frey Adames Alex