I want to update pandas
Hello, I want to compare the speeds of single-core and multicore in pandas dataframe calculations. The following cases are given, The column'c' in the 'i'th-row is the average of the values โโof 'a' from 'i-9'-row to 'i'th-row.
from multiprocessing import Process, Value, Array, Manager
import pandas as pd
import numpy as np
import time
total_num = 1000
df = pd.DataFrame(np.arange(1,total_num*2+1).reshape(total_num,2),
columns=['a','b'])
df['c']=0
df2 = pd.DataFrame(np.arange(1,total_num*2+1).reshape(total_num,2),
columns=['a','b'])
df2['c']=0
def Cal(start, end):
for i in range(end-start-1):
if i+start < 10:
df.loc[i+start,'c']=df.loc[:i+start,'c'].mean()
else :
df.loc[i+start,'c']=df.loc[i-9:i+start,'c'].mean()
def Cal2(my_df,start, end):
for i in range(end-start-1):
if i+start < 10:
my_df.df.loc[i+start,'c']=my_df.df.loc[:i+start,'c'].mean()
else :
my_df.df.loc[i+start,'c']=my_df.df.loc[i-9:i+start,'c'].mean()
print(my_df)
print('Single core : --->')
start_t = time.time()
Cal(0,total_num+1)
end_t = time.time()
print(end_t-start_t)
print('Multiprocess ---->')
if __name__=='__main__':
num=len(df2)
num_core=4
between=num//num_core
mgr=Manager()
ns = mgr.Namespace()
ns.df=df2
procs=[]
start_t =time.time()
for index in range(num_core):
proc=Process(target=Cal2,args=(ns,index*between,(index+1)*between))
procs.append(proc)
proc.start()
for proc in procs:
proc.join()
end_t = time.time()
print(end_t-start_t)
At first I realized that Multiprocessing does not use global variables. So I used Manager. However, the 'c'column of df2 did not change.
How do I do what I want to do? :p