I am having trouble with the runtime of an algorithm that matches names with the most likely email address. The function itself works well (it pairs each name with the correct email address), but the runtime is so long that it is impractical to use on large data sets. I am a beginner at coding and would love to hear what solutions you might suggest.
Quick note: I implemented the Levenshtein distance algorithm here. If there are more efficient algorithms, please comment below!
import copy
import functools
import re
from string import digits
# Levenshtein algorithm found on https://www.python-course.eu/levenshtein_distance.php
def call_counter(func):
    """Wrap *func* so that every invocation is tallied.

    The returned wrapper forwards all positional and keyword arguments to
    *func* unchanged and returns whatever *func* returns.  The running total
    of invocations is exposed as the ``calls`` attribute of the wrapper and
    starts at zero.

    The wrapper keeps the name, docstring and other metadata of *func*.
    """

    @functools.wraps(func)
    def helper(*args, **kwargs):
        helper.calls += 1
        return func(*args, **kwargs)

    # Initialised after functools.wraps has copied func.__dict__, so a
    # ``calls`` attribute carried by an already-counted function cannot
    # leak into this wrapper's tally.
    helper.calls = 0
    return helper
def memoize(func):
    """Cache the results of *func*, keyed by the text of its arguments.

    The cache key is ``str(args) + str(kwargs)``, so arguments do not need
    to be hashable; arguments whose ``str()`` output is identical share one
    cache entry.  Results live in a dict private to this wrapper and are
    never evicted.

    The wrapper keeps the name, docstring and other metadata of *func*.
    """
    mem = {}

    @functools.wraps(func)
    def memoizer(*args, **kwargs):
        key = str(args) + str(kwargs)
        if key not in mem:
            # First time this argument text is seen: compute and remember.
            mem[key] = func(*args, **kwargs)
        return mem[key]

    return memoizer
# NOTE: the former @memoize layer is intentionally not applied here.  It only
# existed to make the recursive formulation tractable; with the iterative
# version below it would just accumulate one cache entry per distinct
# (s, t) pair for the lifetime of the process.
@call_counter
def levenshtein(s, t):
    """Return the Levenshtein (edit) distance between strings *s* and *t*.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn *s* into *t*.

    Implemented iteratively with two rolling rows of the dynamic-programming
    table, so it needs no recursion (and cannot hit the recursion limit on
    long inputs), runs in O(len(s) * len(t)) time and uses
    O(min(len(s), len(t))) extra memory.

    ``levenshtein.calls`` (provided by ``call_counter``) is incremented once
    per distance computed.
    """
    if s == t:
        return 0
    # The distance is symmetric; keep the shorter string on the inner axis
    # so the rows are as short as possible.
    if len(s) < len(t):
        s, t = t, s
    if not t:
        return len(s)

    # previous[j] holds the distance between the processed prefix of s and
    # t[:j]; the initial row is the cost of building t[:j] from "".
    previous = list(range(len(t) + 1))
    for i, s_char in enumerate(s, start=1):
        current = [i]  # distance from s[:i] to the empty prefix of t
        for j, t_char in enumerate(t, start=1):
            cost = 0 if s_char == t_char else 1
            current.append(min(previous[j] + 1,          # delete from s
                               current[j - 1] + 1,       # insert into s
                               previous[j - 1] + cost))  # substitute / match
        previous = current
    return previous[-1]
def _name_variants(name):
    """Return the lower-cased spellings of *name* to compare with an address."""
    parts = name.lower().split()
    first, last = parts[0], parts[-1]
    if len(parts) == 1:
        # Single-word name: there is no separate first/last name to combine.
        return [first]
    return [last, first, first + last, first + "." + last]


def _closest_address(variants, addresses):
    """Return the address whose local part is nearest to any of *variants*.

    *addresses* is a sequence of ``(address, lower-cased local part)`` pairs.
    Ties go to the earliest address; ``""`` is returned when there are none.
    """
    best_match = ""
    minimum = None
    for address, local_part in addresses:
        distance = min(levenshtein(variant, local_part) for variant in variants)
        if minimum is None or distance < minimum:
            minimum = distance
            best_match = address
            if distance == 0:
                # Exact match: a later address can never be strictly closer.
                break
    return best_match


def emailmatch(emails_file, name_file):
    """Pair every name in *name_file* with its closest address in *emails_file*.

    Both files are read as text with one entry per line; blank lines are
    skipped.  For each name, the first word is taken as the first name and
    the last word as the last name.  The last name, first name, their
    concatenation and their dot-joined form are compared, case-insensitively,
    against the part of each address before ``@`` using ``levenshtein``; the
    address with the smallest distance wins (first one on ties).  An address
    line without ``@`` is compared as a whole.

    Returns a dict mapping each name to its best-matching address, both with
    surrounding whitespace and line endings removed.  A name maps to ``""``
    when the address file has no entries.
    """
    # Load and pre-process the address list once, rather than re-opening and
    # re-parsing the file for every single name.
    addresses = []
    with open(emails_file, 'r') as address_emails:
        for line in address_emails:
            address = line.strip()
            if address:
                local_part = address.partition('@')[0].lower()
                addresses.append((address, local_part))

    name_email_match = {}  # name -> best-matching address
    with open(name_file, 'r') as names:
        for line in names:
            individual = line.strip()
            if not individual:
                continue
            name_email_match[individual] = _closest_address(
                _name_variants(individual), addresses)
    return name_email_match
# Script entry: match the names listed in names.txt against the addresses in
# emails.txt.  The resulting mapping is not stored or printed here.
emailmatch(emails_file='emails.txt', name_file='names.txt')