Performance problem when using pandas apply on big dataframes

Question

I'm having some performance issues with the code below, mostly because of the apply function that I'm using on a huge dataframe. I want to update the semi_dict dictionary with some other data that I'm calculating with some functions. Is there any way to improve this?

def my_function_1(semi_dict, row):
    """Append one record derived from ``row`` to ``semi_dict["data"]``.

    The per-row calculation is elided in this snippet: ``some_data`` and
    ``more_data`` stand in for its results and are not defined here.
    ``semi_dict`` is mutated in place and nothing is returned.
    """
    # ... per-row calculation producing some_data / more_data goes here ...
    record = {"data": some_data, "more_data": more_data}
    semi_dict["data"].append(record)
   
def my_function_2(semi_dict, row):
    """Append one record derived from ``row`` to ``semi_dict["data2"]``.

    The per-row calculation is elided in this snippet: ``some_data`` and
    ``more_data`` stand in for its results and are not defined here.
    ``semi_dict`` is mutated in place and nothing is returned.
    """
    # ... per-row calculation producing some_data / more_data goes here ...
    record = {"data": some_data, "more_data": more_data}
    semi_dict["data2"].append(record)


# One result dictionary is collected per entry of ``values``.
dictionary_list = []

for v in values:
    # Filter into fresh per-iteration names. Rebinding df_1_rows / df_2_rows
    # to their own filtered result would make every later iteration filter an
    # already-filtered frame, leaving nothing to match for later values.
    # NOTE(review): if these are DataFrames, ``.values`` is the underlying
    # array rather than a column called "values" -- confirm the mask is the
    # one intended.
    rows_1 = df_1_rows[(df_1_rows.values == v)]
    rows_2 = df_2_rows[(df_2_rows.values == v)]

    # Both row handlers append into this single dictionary.
    semi_dict = dict(value=v, data=[], data2=[])

    function = partial(my_function_1, semi_dict)
    function_2 = partial(my_function_2, semi_dict)

    # The partials already take the row as their only remaining argument, so
    # no lambda wrapper is needed. apply() is called purely for the handlers'
    # side effects; its return value is discarded.
    rows_1.apply(function, axis=1)
    rows_2.apply(function_2, axis=1)

    dictionary_list.append(semi_dict)

score 0 · Answer 1 · answered Feb 14 '21 at 15:31

This answer uses the dictionary merge from "How to merge dictionaries of dictionaries?", but depending on your use case, you might not need it in the end:

import pandas as pd
import random

# Size and value pools for the two randomly generated demo frames.
len_df = 10
row_values = ["A", "B", "C", "D"]
extra_col_values = ["1", "2", "3", "4", "5"]


def _random_frame(column_names):
    """Return a ``len_df``-row, two-column frame of random picks.

    The first column is drawn from ``row_values`` and the second from
    ``extra_col_values``, one pair of draws per row.
    """
    records = []
    for _ in range(len_df):
        key = random.choice(row_values)
        extra = random.choice(extra_col_values)
        records.append([key, extra])
    return pd.DataFrame(records, columns=column_names)


df_1 = _random_frame(['col1', 'extra1'])
df_2 = _random_frame(['col2', 'extra2'])

def make_dict(df):
    """Return ``{'data': rows}`` where ``rows`` is the first row of ``df``.

    ``rows`` is a list of row-lists holding at most one row; it is empty
    when ``df`` has no rows. This is a stand-in for a real per-group
    calculation.
    """
    first_row = df.head(1)
    payload = first_row.values.tolist()
    return dict(data=payload)

def make_dict_2(df):
    """Return ``{'data_2': rows}`` where ``rows`` is the first row of ``df``.

    ``rows`` is a list of row-lists holding at most one row; it is empty
    when ``df`` has no rows. This is a stand-in for a real per-group
    calculation.
    """
    first_row = df.head(1)
    payload = first_row.values.tolist()
    return dict(data_2=payload)

def merge(a, b, path=None):
    """Recursively merge ``b`` into ``a`` in place and return ``a``.

    Taken from
    https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries

    Keys only in ``b`` are copied into ``a``. When both sides hold a dict
    under the same key, those dicts are merged recursively. Equal values are
    left alone; any other clash raises ``Exception`` naming the dotted key
    path. ``path`` is the list of parent keys used to build that message.
    """
    trail = [] if path is None else path
    for key, incoming in b.items():
        if key not in a:
            # New key: adopt the value from b as-is.
            a[key] = incoming
            continue

        existing = a[key]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            merge(existing, incoming, trail + [str(key)])
            continue

        if existing == incoming:
            # Same leaf value on both sides; nothing to do.
            continue

        raise Exception('Conflict at %s' % '.'.join(trail + [str(key)]))
    return a

# Run the per-group builder over each frame, then turn each result into a
# plain dict keyed by group value.
per_group_1 = df_1.groupby('col1').apply(make_dict)
per_group_2 = df_2.groupby('col2').apply(make_dict_2)
dict1 = per_group_1.to_dict()
dict2 = per_group_2.to_dict()

# merge() folds dict2 into dict1 in place and returns dict1.
result = merge(dict1, dict2)

# Bare expression: shows the merged mapping in a notebook/REPL.
result

Performance problem when using pandas apply on big dataframes

1 Answer