Finding duplicate records:
I would do something like this:
1) Remove accented characters from the string by folding them to their ASCII equivalents.
import unicodedata
def remove_accents(input_str):
    """Return *input_str* with accents stripped, as a plain-ASCII ``str``.

    NFKD normalization decomposes each accented character into its base
    character plus combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops the marks.
    """
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nkfd_form.encode('ASCII', 'ignore')
    # Decode back to ``str``: the original returned ``bytes``, which never
    # compares equal to a ``str``, so the accent-free record could not be
    # compared against other records as intended.
    return only_ascii.decode('ASCII')
2) Check whether the accent-stripped string (record_1) approximately matches record_2, using the Levenshtein (edit) distance for fuzzy matching.
from nltk import metrics, stem, tokenize
def normalize(s):
    """Return *s* lower-cased, stripped, tokenized, and Porter-stemmed.

    Produces a canonical form so that minor inflectional differences
    (plurals, verb endings) do not prevent two records from matching.
    """
    # Build the stemmer lazily and cache it on the function: the original
    # referenced an undefined global ``stemmer`` (NameError at call time),
    # and constructing a PorterStemmer per call would be wasteful.
    if not hasattr(normalize, "_stemmer"):
        normalize._stemmer = stem.PorterStemmer()
    words = tokenize.wordpunct_tokenize(s.lower().strip())
    return ' '.join(normalize._stemmer.stem(w) for w in words)
def fuzzy_match(s1, s2, max_dist=2):
    """Return True if the normalized forms of *s1* and *s2* are within
    *max_dist* edit operations (Levenshtein distance) of each other.
    """
    distance = metrics.edit_distance(normalize(s1), normalize(s2))
    return distance <= max_dist