I'm trying to make a calculation on multiple rows for every row in a dataframe.
My current solution takes so much time when I run 2971000 rows. it almost takes more than 2hours.
So, I want know other solutions to speed up a function
my data looks like this for example.
sig1 sig2 sig3 sig4 sig_p sig_t
20210114 05:52:02.00 0.0 0.0 0.0 0.0 11.5 -3.5
20210114 05:52:02.01 0.0 0.0 0.0 0.0 11.6 -3.5
20210114 05:52:02.02 0.0 0.0 0.0 0.0 11.5 -3.5
20210114 05:52:02.03 0.0 0.0 0.0 0.0 11.6 -3.5
20210114 05:52:02.04 0.0 0.0 0.0 0.0 11.7 -3.5
... ... ... ... ... ... ...
20210114 22:38:59.85 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.86 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.87 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.88 0.0 0.0 0.0 0.0 0.0 -0.5
20210114 22:38:59.89 0.0 0.0 0.0 0.0 0.0 -0.5
I have a function which loops through and calculates value for newcol
based on sig1
, sig_p
, sig_t
,previous newcol
. the function runs repeat for sig1
, sig2
, sig3
, sig4
.
I'll show you the code I currently have, but it's too slow.
parameter.py
from typing import NamedTuple
class Param(NamedTuple):
RATIO : float
D : float
T : float
M : float
S : float
W : float
DYNAMIC : float
T_CONST : float
P_CONST : float
L_COEF : float
O_COEF : float
@property
def A(self):
return (self.D**2)*math.pi
@property
def FACTOR(self):
return self.S / self.A
Param1 = Param(
RATIO = 0.74,
D = 172e-3,
T = 23e-3,
M = 6,
S = 53.7e-4,#4232.5e-6,
W = 0.805,
DYNAMIC = 0.3150,
T_CONST = 2, #4,
P_CONST = 0.2,#3,
L_COEF = 0.8,#4,
O_COEF = 2.5
)
rear = Param(
RATIO = 0.26,
D = 204e-3,
T = 10e-3,
M = 4,
S = 26.8e-4,
W = 0.38,
DYNAMIC = 0.3150,
T_CONST = 1.8,
P_CONST = 0.2,
L_COEF = 0.2,
O_COEF = 1.8
)
test.py
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
TIME_STAMP = 0.1
SPEC = 449
SPECIFIC = 935
EMISSIVITY = 0.7
ABSORBTIVITY = 0.3
DYNAMIC_SPEED = 12
COEFFICIENT = 0.9506173967164384
input_KV = [-75, -50, -25, -15, -10, -5, 0, 5, 10, 15, 20, 25, 30, 40, 50, 60,
80, 100, 125, 150, 175, 200, 225, 300, 412, 500, 600, 700, 800, 900, 1000, 1100]
viscosity_value = [7.4, 9.22, 11.18, 12.01, 12.43, 12.85, 13.28, 13.72, 14.16, 14.61, 15.06, 15.52, 15.98, 16.92, 17.88, 18.86, 20.88,
22.97, 25.69, 28.51, 31.44, 34.47, 37.6, 47.54, 63.82, 77.72, 94.62, 112.6, 131.7, 151.7, 172.7, 194.6]
input_ka = [-190, -150, -100, -75, -50, -25, -15, -10, -5, 0, 5, 10, 15, 20, 25, 30, 40,
50, 60, 80, 100, 125, 150, 175, 200, 225, 300, 412, 500, 600, 700, 800, 900, 1000, 1100]
conductivity_value = [7.82, 11.69, 16.2, 18.34, 20.41, 22.41, 23.2, 23.59, 23.97, 24.36, 24.74, 25.12, 25.5, 25.87, 26.24, 26.62,
27.35, 28.08, 28.8, 30.23, 31.62, 33.33, 35, 36.64, 38.25, 39.83, 44.41, 50.92, 55.79, 61.14, 66.32, 71.35, 76.26, 81.08, 85.83]
def viscosity(input):
fq = interp1d(input_KV,
viscosity_value, kind='quadratic')
return (fq(input)*10e-6)
def conductivity(input):
fq = interp1d(input_ka,
conductivity_value, kind='quadratic')
return (fq(input)*10e-3)
def calculation(Param, sig, sig_p, sig_t):
new_col1 = np.empty(len(sig_p))
new_col1[0] = sig_t[0]
my_goal = np.empty(len(sig_p))
my_goal[0] = sig_t[0]
calc1 = COEFFICIENT * Param.RATIO * sig_p * sig /2
for n in range(1, len(sig_p)):
calc2 = EMISSIVITY * Param.A * (new_col1[n-1]**4 - sig_t[n]**4)
Ka = conductivity(sig_t[n])
if sig[n] == 0:
h = Param.O_COEF
else :
KV = viscosity(sig_t[n])
if sig[n] < DYNAMIC_SPEED:
h = (0.7*(sig[n]/KV)**0.4) * Ka + Param.O_COEF
else :
h = (0.04*(sig[n])/KV**0.8) * Ka + Param.L_COEF
calc3 = h * Param.A * (new_col1[n-1] - sig_t[n])
calc4 = Ka *Param.A * (new_col1[n-1] - sig_t[n]) / Param.T
a1 = (calc1[n] - (calc2 + calc3 + calc4)) / (SPEC * Param.M)
new_col1[n] = new_col1[n-1] + a1 * TIME_STAMP
if sig_p[n] == 0 :
val1 = ABSORBTIVITY * Param.FACTOR * calc2
elif (sig_p[n] > 0) & (sig_p[n] <= 20):
val1 = ABSORBTIVITY * Param.FACTOR * calc2* (20-sig_p[n])/20 + ((1-COEFFICIENT) * calc1[n] / (4)) * sig_p[n] / 20
else:
val1 = (1-COEFFICIENT) * calc1[n] / 4
if sig[n] == 0:
val2 = Param.T_CONST
else:
h_bar = Param.P_CONST * (sig[n] *Param.DYNAMIC)**0.8
val2 = h_bar * Param.S * (my_goal[n-1] - sig_t[n])
a2 = (val1 - (val2)) / (SPECIFIC * Param.W)
my_goal[n] = my_goal[n-1] + a2 * TIME_STAMP
if my_goal[n] < sig_t[0] : my_goal[n] = sig_t[0]
return my_goal
df = pd.read_csv('data.csv', index_col=0)
df['newcol1'] = calculation(Param1, df['sig1'].values, df[sig_p].values, df['sig_t'].values)
df['newcol2'] = calculation(Param1, df['sig2'].values, df[sig_p].values, df['sig_t'].values)
df['newcol3'] = calculation(Param2, df['sig3'].values, df[sig_p].values, df['sig_t'].values)
df['newcol4'] = calculation(Param2, df['sig4'].values, df[sig_p].values, df['sig_t'].values)
I now need to apply this function to several million rows and it's impossibly slow so I'm trying to figure out the best way to speed it up. I've heard that Cython can increase the speed of functions but I have no experience with it (and I'm new to both pandas and python).
My question is if there is anyway to enhance or speed up this computation method.
I'm run this python code on AWS(sagemaker>notebook instance, jupyter) and my computer OS is window.