I have a script (below) that checks a column of addresses in my main dataframe against a column of addresses in a reference dataframe, to see whether they match and how well they match.
My main dataframe (business_rates.csv) contains about 3 million records, and my reference dataframe (all_food_hygiene_data_clean_up.csv) contains about 10,000 records. I am getting this error when I run the match:
ERROR: Process finished with exit code 137 (interrupted by signal 9: SIGKILL)
I think this is due to running out of memory (see my rough estimate below the script). Can someone tell me how to resolve exit code 137?
import time
from itertools import islice

import pandas as pd
from dask import dataframe as dd
from rapidfuzz import process, fuzz

# The reference data (~10,000 rows) fits in memory; the main file
# (~3 million rows) is read with dask.
ref_df = pd.read_csv('all_food_hygiene_data_clean_up.csv')
df = dd.read_csv('business_rates.csv', low_memory=False)

# Materialise the address columns as plain Python lists.
contacts_addresses = list(df.address)             # ~3 million strings
ref_addresses = list(ref_df.ref_address.unique())
post_code = list(ref_df.post_code)

scores_list = []
names = []

start = time.time()
print("start time:", time.ctime(start))

# Compare the reference addresses in chunks of 1,000 against all
# contact addresses.
chunk_size = 1000
ref_addr_iter = iter(ref_addresses)
while ref_addr_chunk := list(islice(ref_addr_iter, chunk_size)):
    # Score every pair in the chunk: a
    # (len(ref_addr_chunk) x len(contacts_addresses)) matrix.
    scores = process.cdist(ref_addr_chunk, contacts_addresses,
                           scorer=fuzz.token_sort_ratio,
                           score_cutoff=0, workers=-1)
    # Index of the best-matching contact address for each reference address.
    max_scores_idx = scores.argmax(axis=1)
    print('post_code', len(post_code))
    print('max_scores_idx', len(max_scores_idx))
    for ref_addr_idx, score_idx in enumerate(max_scores_idx):
        names.append((ref_addr_chunk[ref_addr_idx], contacts_addresses[score_idx]))
        scores_list.append(scores[ref_addr_idx, score_idx])

end = time.time()
print("end time:", time.ctime(end))

# Pair each reference address with its best match and score, then join
# back onto the reference data and write the results out.
name_dict = dict(names)
match_df = pd.DataFrame(name_dict.items(), columns=['ref_address', 'matched_address'])
scores_df = pd.DataFrame(scores_list)
merged_results_01 = pd.concat([match_df, scores_df], axis=1)
merged_results_01.to_csv('merged_results_01.csv')

merged_results_02 = pd.merge(ref_df, merged_results_01, how='right', on='ref_address')
merged_results_02.to_csv('results.csv', mode='a', index=False)
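My suspicion is that the per-chunk score matrix is what gets the process killed. Here is my rough back-of-the-envelope check (just a sketch; the float32 dtype is my assumption about what cdist returns for token_sort_ratio, and 3 million is an approximate row count):

# Rough size of one chunk's score matrix: chunk_size x number of contact
# addresses, at 4 bytes per score (assuming cdist returns float32).
chunk_size = 1000
n_contacts = 3_000_000          # approximate row count of business_rates.csv
bytes_per_score = 4             # assumed float32
print(chunk_size * n_contacts * bytes_per_score / 1e9, "GB")  # ~12.0 GB

If that estimate is right, each iteration needs roughly 12 GB for scores alone, on top of the ~3 million address strings already held in contacts_addresses, which would explain the OOM kill.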