You could also use fuzzywuzzy
to compute the Levensthein distance, even if there are more than two "similar" values
For example
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
DF = pd.DataFrame({'Code':['abc', 'abc', 'abc', 'abc', 'def', 'def', 'def', 'abc'],
'Description':['ABC String', 'ABC String',
'ABC String and sth', 'Only sth else',
'ABC String', 'CDEFGH', 'CDEFGH and sth',
'CDEFGH and sth',],
'Value':[10, 20, 30, 40, 50, 60, 70, 80]})
# for each unique value in Description
for d in DF.Description.unique():
# compute Levensthein distance
# and set to True if >= a limit
# (you may have to play around with it)
DF[d] = DF['Description'].apply(
lambda x : fuzz.ratio(x, d) >= 60
)
# set a name for the group
# here, simply the shortest
m = np.min(DF[DF[d]==True].Description)
# assign the group
DF.loc[DF.Description==d, 'group'] = m
print(DF)
Code Description Value ABC String group \
0 abc ABC String 10 True ABC String
1 abc ABC String 20 True ABC String
2 abc ABC String and sth 30 True ABC String
3 abc Only sth else 40 False Only sth else
4 def ABC String 50 True ABC String
5 def CDEFGH 60 False CDEFGH
6 def CDEFGH and sth 70 False CDEFGH
7 abc CDEFGH and sth 80 False CDEFGH
ABC String and sth Only sth else CDEFGH CDEFGH and sth
0 True False False False
1 True False False False
2 True False False False
3 False True False False
4 True False False False
5 False False True True
6 False False True True
7 False False True True
And now you can groupby
the created groups
DF.groupby('group').Value.mean()
group
ABC String 27.5
CDEFGH 70.0
Only sth else 40.0
Name: Value, dtype: float64