I have put together some Python code to handle a routine task for my team. In a nutshell, we have two CSVs for managing and updating user accounts in our system: ABC.csv contains the most up-to-date data, while XYZ.csv is exported from the account data of the system itself.
The purpose of the code is to compare the data in the two CSVs. Wherever XYZ.csv holds data that differs from ABC.csv, it generates update CSVs for us to upload back into the system.
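To make the intent concrete, here is a stripped-down sketch of just the comparison step (my real code is further down); it assumes the employee number is the join key, as it is in the full script, and only compares two of the fields:

import pandas as pd

abc = pd.read_csv("abc.csv", dtype=str)   # authoritative data
xyz = pd.read_csv("xyz.csv", dtype=str)   # current system accounts

# Join the system accounts to the authoritative records on the employee number
merged = xyz.merge(abc, left_on="Employee", right_on="Emp No",
                   how="inner", suffixes=("_xyz", "_abc"))

# Keep only the rows where a compared field differs (just two fields shown here)
differs = (
    (merged["First Name_xyz"] != merged["First Name_abc"])
    | (merged["Last Name_xyz"] != merged["Last Name_abc"])
)
merged[differs].to_csv("updates.csv", index=False)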
The code runs just fine when I use 4-5 test records. However, we typically work with over 20,000 records, and when I run the code against datasets of that size it just keeps running without ever producing an error or a result.
I even left it running for over an hour but still nothing. No error. No result.
I tried using Modin to speed up the processing, but that too seems to run continuously without producing any result or error.
My questions are:
- Is this normal? Is my dataset simply so large that it needs hours to run?
- Does my code need optimization or a different library that can work faster?
- The data in the two CSVs is quite asymmetrical. XYZ.csv has many more records, as it contains accounts that are not meant to be updated (we have no way of easily excluding the ones we don't need). Could something in the data be causing a block? (See the sketch just after this list.)
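To put the asymmetry in perspective: every XYZ.csv row that misses the exact employee-number match falls back to a fuzzy scan of every ABC.csv row, so with 20,000+ rows on each side that is on the order of 20,000 × 20,000 name comparisons in the worst case. Purely as an illustration of what I mean by "excluding the ones we don't need" (this is not in my code, and it ignores the fuzzy-match fallback), a pre-filter keeping only the system accounts whose employee number appears in ABC.csv would look like this:

import pandas as pd

abc = pd.read_csv("abc.csv", dtype=str)
xyz = pd.read_csv("xyz.csv", dtype=str)

# Keep only the xyz accounts whose employee number appears in abc;
# everything else would be out of scope for the update.
xyz_in_scope = xyz[xyz["Employee"].isin(abc["Emp No"])]
print(len(xyz), "system accounts,", len(xyz_in_scope), "in scope")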
import pandas as pd
import concurrent.futures
from fuzzywuzzy import fuzz

def fuzzy_match(s1, s2, threshold=85):
    return fuzz.ratio(s1, s2) >= threshold

def match_rows(index, xyz_row):
    # Try an exact match on the employee number first
    match = abc[abc["Emp No"] == xyz_row["Employee"]]
    abc_row = None
    if not match.empty:
        abc_row = match.iloc[0]
    else:
        # Fuzzy matching using First Name, Last Name, and Email
        fuzzy_matched_rows = abc.apply(
            lambda x: fuzzy_match(
                str(xyz_row["First Name"]) + " " + str(xyz_row["Last Name"]),
                str(x["First Name"]) + " " + str(x["Last Name"])
            ) and (pd.isna(xyz_row["Email"]) or fuzzy_match(str(xyz_row["Email"]), str(x["Email"]))),
            axis=1
        )
        abc_fuzzy_matched = abc[fuzzy_matched_rows]
        if not abc_fuzzy_matched.empty:
            abc_row = abc_fuzzy_matched.iloc[0]
    return xyz_row, abc_row

abc = pd.read_csv("abc.csv", low_memory=False)
xyz = pd.read_csv("xyz.csv", low_memory=False)

def is_data_different(xyz_row, abc_row):
    if xyz_row["First Name"] != abc_row["First Name"]:
        return True
    if xyz_row["Last Name"] != abc_row["Last Name"]:
        return True
    if xyz_row["Employee"] != abc_row["Emp No"]:
        return True
    if xyz_row["User's Organisation Name(s)"] != abc_row["Org L13"]:
        return True
    if xyz_row["User's Job Title(s)"] != abc_row["Poslevel2"]:
        return True
    if xyz_row["Assignment Status"] != abc_row["Assignment Status"]:
        return True
    if not pd.isna(xyz_row["Email"]) and xyz_row["Email"] != abc_row["Email"]:
        return True
    # Add more field comparisons as needed
    return False

usermatch_columns = ["idnumber", "timemodified", "username", "deleted", "firstname", "lastname", "email", "town", "auth", "customfield_hierarchypath", "customfield_orgframework", "customfield_occcode", "customfield_posno", "customfield_title", "customfield_middlename", "customfield_fte", "customfield_employee", "customfield_payscale", "customfield_assignmentstartdate", "customfield_areaofwork", "customfield_maidenname", "customfield_assignmentstatus", "customfield_ProfessionalNumber", "customfield_TrainingGrade", "customfield_PatientHandler", "customfield_WorkswithChildren", "customfield_Clinical", "customfield_Frontline"]
jobassign_columns = ["idnumber", "useridenumber", "timemodified", "deleted", "fullname", "orgidnumber", "posidnumber", "manageridnumber"]
usermatch = pd.DataFrame(columns=usermatch_columns)
jobassign = pd.DataFrame(columns=jobassign_columns)
manager_username = None
# Add userupload_columns and userupload DataFrame
userupload_columns = ["username", "firstname", "lastname", "email", "auth", "idnumber"]
userupload = pd.DataFrame(columns=userupload_columns)
manager_idnumber_dict = dict(zip(abc["Emp No"], abc["Manager Employee Number"]))

with concurrent.futures.ThreadPoolExecutor() as executor:
    # xyz.iterrows() yields (index, row) pairs; unpack them into the two arguments of match_rows
    matched_rows = list(executor.map(match_rows, *zip(*xyz.iterrows())))

for xyz_row, abc_row in matched_rows:
    if abc_row is not None:
        # Check if User ID Number is empty, and use the capitalized version of the username if it is
        if pd.isna(xyz_row["User ID Number"]) or xyz_row["User ID Number"] == "":
            xyz_row["User ID Number"] = xyz_row["Username"].title()
        manager_id = abc_row["Manager Employee Number"]
        manager_username = xyz.loc[xyz["Employee"] == manager_id, "Username"]
        manager_username = manager_username.iloc[0] if not manager_username.empty else None
        if is_data_different(xyz_row, abc_row):
            # Populate usermatch.csv
            usermatch_row = {
                "idnumber": xyz_row["User ID Number"],
                "timemodified": 0,
                "username": xyz_row["Username"],
                "deleted": 0,
                "firstname": abc_row["First Name"],
                "lastname": abc_row["Last Name"],
                "email": xyz_row["Email"],
                "town": abc_row["City"],
                "auth": "manual",
                "customfield_hierarchypath": f'{abc_row["Org L3"]}~{abc_row["Org Level 3"]}~{abc_row["Org L6"]}~{abc_row["Org L13"]}',
                "customfield_orgframework": "UHSussex",
                "customfield_occcode": abc_row["Occ Code"],
                "customfield_posno": abc_row["Position No"],
                "customfield_title": abc_row["Title"],
                "customfield_middlename": abc_row["Middle Name"],
                "customfield_fte": abc_row["FTE"],
                "customfield_employee": abc_row["Emp No"],
                "customfield_payscale": abc_row["Pay Scale"],
                "customfield_assignmentstartdate": abc_row["Assign Start Date"],
                "customfield_areaofwork": abc_row["Area Of Work"],
                "customfield_maidenname": abc_row["Maiden Name"],
                "customfield_assignmentstatus": abc_row["Assignment Status"],
                "customfield_ProfessionalNumber": abc_row["Professional Registration Num"],
                "customfield_TrainingGrade": "",
                "customfield_PatientHandler": 0,
                "customfield_WorkswithChildren": 0,
                "customfield_Clinical": 0,
                "customfield_Frontline": 0
            }
            usermatch = pd.concat([usermatch, pd.DataFrame([usermatch_row], columns=usermatch_columns)], ignore_index=True)

            # Populate jobassign.csv
            jobassign_row = {
                "idnumber": abc_row["Emp No"],
                "useridenumber": xyz_row["User ID Number"],
                "timemodified": 0,
                "deleted": 0,
                "fullname": abc_row["Position Title"],
                "orgidnumber": abc_row["Org L13"],
                "posidnumber": abc_row["Poslevel2"],
                "manageridnumber": manager_username
            }
            jobassign = pd.concat([jobassign, pd.DataFrame([jobassign_row], columns=jobassign_columns)], ignore_index=True)

            # Populate userupload.csv
            userupload_row = {
                "username": xyz_row["Username"],
                "firstname": abc_row["First Name"],
                "lastname": abc_row["Last Name"],
                "email": xyz_row["Email"],
                "auth": "manual",
                "idnumber": xyz_row["User ID Number"],
            }
            userupload = pd.concat([userupload, pd.DataFrame([userupload_row], columns=userupload_columns)], ignore_index=True)

def has_fuzzy_match(abc_row):
    # Scan every xyz row for a fuzzy name match (and, where an email is present, a fuzzy email match)
    for _, xyz_row in xyz.iterrows():
        if fuzzy_match(str(abc_row["First Name"]) + " " + str(abc_row["Last Name"]), str(xyz_row["First Name"]) + " " + str(xyz_row["Last Name"])) and (pd.isna(abc_row["Email"]) or fuzzy_match(str(abc_row["Email"]), str(xyz_row["Email"]))):
            return True
    return False

# Step 1: Create a new DataFrame called newusers with the same columns as userupload.csv
newusers_columns = ["username", "firstname", "lastname", "email", "auth", "idnumber"]
newusers = pd.DataFrame(columns=newusers_columns)

# Step 2: Iterate through the rows of abc.csv and check if each row exists in xyz.csv
for _, abc_row in abc.iterrows():
    if abc_row["Emp No"] not in xyz["Employee"].values and not has_fuzzy_match(abc_row):
        # Step 3: Generate a username from the email address and populate the newusers.csv file
        username = abc_row["Email"].split('@')[0] if pd.notna(abc_row["Email"]) else ""
        newusers_row = {
            "username": username,
            "firstname": abc_row["First Name"],
            "lastname": abc_row["Last Name"],
            "email": abc_row["Email"],
            "auth": "manual",
            "idnumber": username.title(),
        }
        newusers = pd.concat([newusers, pd.DataFrame([newusers_row], columns=newusers_columns)], ignore_index=True)

        # Add new user data to usermatch and jobassign DataFrames
        manager_id = abc_row["Manager Employee Number"]
        manager_username = xyz.loc[xyz["Employee"] == manager_id, "Username"]
        manager_username = manager_username.iloc[0] if not manager_username.empty else None
        usermatch_row = {
            "idnumber": username.title(),
            "timemodified": 0,
            # New users do not exist in xyz.csv, so take the username and email from the abc record
            "username": username,
            "deleted": 0,
            "firstname": abc_row["First Name"],
            "lastname": abc_row["Last Name"],
            "email": abc_row["Email"],
            "town": abc_row["City"],
            "auth": "manual",
            "customfield_hierarchypath": f'{abc_row["Org L3"]}~{abc_row["Org Level 3"]}~{abc_row["Org L6"]}~{abc_row["Org L13"]}',
            "customfield_orgframework": "UHSussex",
            "customfield_occcode": abc_row["Occ Code"],
            "customfield_posno": abc_row["Position No"],
            "customfield_title": abc_row["Title"],
            "customfield_middlename": abc_row["Middle Name"],
            "customfield_fte": abc_row["FTE"],
            "customfield_employee": abc_row["Emp No"],
            "customfield_payscale": abc_row["Pay Scale"],
            "customfield_assignmentstartdate": abc_row["Assign Start Date"],
            "customfield_areaofwork": abc_row["Area Of Work"],
            "customfield_maidenname": abc_row["Maiden Name"],
            "customfield_assignmentstatus": abc_row["Assignment Status"],
            "customfield_ProfessionalNumber": abc_row["Professional Registration Num"],
            "customfield_TrainingGrade": "",
            "customfield_PatientHandler": 0,
            "customfield_WorkswithChildren": 0,
            "customfield_Clinical": 0,
            "customfield_Frontline": 0
        }
        usermatch = pd.concat([usermatch, pd.DataFrame([usermatch_row], columns=usermatch_columns)], ignore_index=True)
        jobassign_row = {
            "idnumber": abc_row["Emp No"],
            "useridenumber": username.title(),
            "timemodified": 0,
            "deleted": 0,
            "fullname": abc_row["Position Title"],
            "orgidnumber": abc_row["Org L13"],
            "posidnumber": abc_row["Poslevel2"],
            "manageridnumber": manager_username
        }
        jobassign = pd.concat([jobassign, pd.DataFrame([jobassign_row], columns=jobassign_columns)], ignore_index=True)

# Save the output CSVs
usermatch.to_csv("usermatch.csv", index=False)
jobassign.to_csv("jobassign.csv", index=False)
userupload.to_csv("userupload.csv", index=False)
newusers.to_csv("newusers.csv", index=False)