I am trying to find a way to parallelise certain operations on dataframes, especially those that cannot be vectorised.
I have tested the code below, taken from http://www.racketracer.com/2016/07/06/pandas-in-parallel/ , but it doesn't work. No error message - quite simply, nothing happens. Debugging it, it seems the code gets stuck at df = pd.concat(pool.map(func, df_split))
, but without any error messages.
What am I doing wrong?
import timeit
import pandas as pd
import numpy as np
import seaborn as sns
import multiprocessing
from multiprocessing import Pool
def parallelize_dataframe(df, func):
df_split = np.array_split(df, num_partitions)
pool = Pool(num_cores)
df = pd.concat(pool.map(func, df_split))
pool.close()
pool.join()
return df
def multiply_columns(data):
data['length_of_word'] = data['species'].apply(lambda x: len(x))
return data
num_partitions = 2 #number of partitions to split dataframe
num_cores = 2# multiprocessing.cpu_count() #number of cores on your machine
iris = pd.DataFrame(sns.load_dataset('iris'))
iris = parallelize_dataframe(iris, multiply_columns)