I would like to parallelize the following function, where A and B are columns of my input data frame. I would like to pass the output dictionary in as an argument so that it is filled inside the function (pass by reference).
import pandas as pd
from tqdm import tqdm

outdict = {}
inputdict1 = {'id1': 100, 'id2': 200, 'id3': 0}
inputdict2 = {'id1': ['cat'], 'id2': ['dog', 'rabbit'], 'id3': []}
inputdf = pd.DataFrame({'A': ['cat', 'cat', 'dog', 'dog', 'dog', 'cat', 'rabbit', 'rabbit'],
                        'B': ['a', 'b', 'b', 'c', 'c', 'd', 'e', 'f']})

def processing(outdict, inputdict1, inputdict2, inputdf):
    # for each key, count the distinct B values whose A is listed in inputdict2[key]
    for key, _ in tqdm(inputdict1.items()):
        outdict[key] = inputdf[inputdf.A.isin(inputdict2[key])].B.nunique()
processing(outdict, inputdict1, inputdict2, inputdf)
print(outdict)
{'id1': 3, 'id2': 4, 'id3': 0}
After some research, I tried the following approach:
from multiprocessing import Pool

def processing(outdict, inputdict1, inputdict2, inputdf):
    for key, value in tqdm(inputdict1.items()):
        outdict[key] = inputdf[inputdf.A.isin(inputdict2[key])].B.nunique()

outdict = {}
pool = Pool()
pool.starmap(processing, zip(outdict, inputdict1, inputdict2, inputdf))
print(outdict)
{}
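My suspicion is that each worker process only receives a copy of outdict, so whatever processing writes never reaches the parent process, and that zip(outdict, inputdict1, inputdict2, inputdf) does not build the per-key argument tuples I intended (it zips dictionary keys with column names, and yields nothing at all here because outdict is empty). Below is a minimal sketch of what I think might work instead: each worker returns a (key, count) pair and the dictionary is assembled in the parent, reusing the inputdict1, inputdict2, and inputdf defined above. The helper name count_unique is just illustrative, and I am not sure this is the idiomatic approach:

from multiprocessing import Pool

def count_unique(key, inputdict2, inputdf):
    # one task per key: return a (key, count) pair instead of mutating shared state
    return key, inputdf[inputdf.A.isin(inputdict2[key])].B.nunique()

if __name__ == '__main__':
    # one argument tuple per key; inputdict2 and inputdf are pickled to each worker
    args = [(key, inputdict2, inputdf) for key in inputdict1]
    with Pool() as pool:
        results = pool.starmap(count_unique, args)
    outdict = dict(results)
    print(outdict)

Would collecting the results like this be the recommended way, or is there a shared structure such as multiprocessing.Manager().dict() that would let me keep the original pass-by-reference style?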