0

What explains the difference in the time taken to rename columns in a pandas DataFrame? At first, I used the code df=df.rename(columns = {'colold':'colnew'}). This proved to be inefficient. I then changed the code to df.rename(columns = {'colold':'colnew'}), without reassigning the result, which reduced the processing time. While it seems obvious that the latter approach would take less time because it skips a step, why is this the case? In the following code, the difference between the two approaches is about 30 seconds.

 import time
   import numpy as np
   import random
   import pandas as pd


   def fun_user_id(start, end, step):
       """Return evenly spaced values from start to end, rounded to 0 decimals.

       The number of points is ``(end - start) * int(1 / step) + 1``. The
       result is a plain Python list of floats.
       """
       point_count = (end - start) * int(1 / step) + 1
       spaced = np.linspace(start, end, point_count)
       return [round(value, 0) for value in spaced.tolist()]

   def fun_rand_num(n=20000000, low=300, high=800):
       """Return a list of ``n`` random integers drawn from ``[low, high]``.

       The defaults reproduce the previously hard-coded behavior: 20,000,000
       values between 300 and 800, both ends inclusive (``random.randint``
       includes both bounds).

       Args:
           n: Number of values to generate.
           low: Smallest value that may be produced.
           high: Largest value that may be produced.

       Returns:
           A list of ``n`` ints.
       """
       return [random.randint(low, high) for _ in range(n)]

   if __name__=='__main__':
       # Benchmark: time ten single-column renames whose result is assigned
       # back to df against ten single-column renames whose result is dropped.
       userid = fun_user_id(1, 20000001, .5)

       # Column layout: userid, var1..var10, var1a..var10a.
       base_names = [f'var{i}' for i in range(1, 11)]
       column_names = ['userid'] + base_names + [f'{name}a' for name in base_names]

       # One freshly generated random column per non-userid name, in column order.
       random_columns = [fun_rand_num() for _ in column_names[1:]]

       # zip() stops at the shortest input, so the row count is set by the
       # shortest of userid and the random columns.
       df = pd.DataFrame(list(zip(userid, *random_columns)), columns=column_names)

       # First timed group: var1..var5 then var1a..var5a.
       # Second timed group: var6..var10 then var6a..var10a.
       reassigned = [f'var{i}{suffix}' for suffix in ('', 'a') for i in range(1, 6)]
       discarded = [f'var{i}{suffix}' for suffix in ('', 'a') for i in range(6, 11)]

       # perf_counter() is a monotonic, high-resolution clock intended for
       # measuring short durations; time.time() can jump if the system clock
       # is adjusted.
       start1 = time.perf_counter()
       for old_name in reassigned:
           df = df.rename(columns={old_name: old_name.replace('var', 'VAR')})
       end1 = time.perf_counter()
       print(f"Elapsed time rename df reassigned : {end1 - start1:.2f} seconds")

       start2 = time.perf_counter()
       for old_name in discarded:
           # NOTE: without inplace=True, rename() returns a new DataFrame. The
           # result is not kept here, so df's column names stay unchanged and
           # this loop does not do the same work as the first one.
           df.rename(columns={old_name: old_name.replace('var', 'VAR')})
       end2 = time.perf_counter()
       print(f"Elapsed time rename df not reassigned : {end2 - start2:.2f} seconds")

       print(f"Time Difference : {(end2 - start2)-(end1 - start1):.2f} seconds")

1 Answers1

0

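Calling df.rename() without reassignment doesn't change the column names: it returns a new DataFrame, which is then discarded, so the two approaches are not doing the same work. To rename without reassigning, you need to use df.rename(columns={'col':'new_col'}, inplace=True), and then compare the time taken.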

  • Additionally, you may want to read about the inplace parameter and the arguments for and against using it. There are several posts on this, one of which I'm linking here - https://stackoverflow.com/questions/45570984/in-pandas-is-inplace-true-considered-harmful-or-not – sastaengineer Jul 10 '23 at 14:49