Initially had everything written pandas and for this exploratory exercise i had did a lot of groupby's and while running with the whole data it was ran for 1h 26m. Over the last weekend, i had changed everything from pandas to using numpy, currently it took(Wall time: 38min 27s). I would like to know if it can be further improved
While converting to numpy, i had additionally used numpy_indexed
.
Overall what i am doing is calling the below function this loop(i have read in lots of places loops are bad). Dataset has around 657058 rows and there around 5000 tickers.
for idx, ticker in enumerate(ticker_list):
...
df_temp = weekly_trend_analysis(exchange, df_weekly, df_daily)
...
df_weekly_all = pd.concat([df_weekly_all, df_temp], sort=False)
def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
if exchange == 'BSE':
ticker = df_daily.iloc[0]['sc_code']
else:
ticker = df_daily.iloc[0]['symbol']
arr_yearWeek = df_daily['yearWeek'].to_numpy()
arr_close = df_daily['close'].to_numpy()
arr_prevclose = df_daily['prevclose'].to_numpy()
arr_chng = df_daily['chng'].to_numpy()
arr_chngp = df_daily['chngp'].to_numpy()
arr_ts = df_daily['ts'].to_numpy()
arr_volumes = df_daily['volumes'].to_numpy()
# Close
arr_concat = np.column_stack((arr_yearWeek, arr_close))
npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
#a = df_temp[['yearWeek', 'close']].to_numpy()
yearWeek, daysTraded = np.unique(arr_concat[:,0], return_counts=True)
cmaxs, cmins = [], []
first, last, wChng, wChngp = [], [], [], []
for idx,subarr in enumerate(npi_gb):
cmaxs.append( np.amax(subarr) )
cmins.append( np.amin(subarr) )
first.append(subarr[0])
last.append(subarr[-1])
wChng.append( subarr[-1] - subarr[0] )
wChngp.append( ( (subarr[-1] / subarr[0]) * 100) - 100 )
#npi_gb.clear()
arr_concat = np.empty((100,100))
# Chng
arr_concat = np.column_stack((arr_yearWeek, arr_chng))
npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
HSDL, HSDG = [], []
for idx,subarr in enumerate(npi_gb):
HSDL.append( np.amin(subarr) )
HSDG.append( np.amax(subarr) )
#npi_gb.clear()
arr_concat = np.empty((100,100))
# Chngp
arr_concat = np.column_stack((arr_yearWeek, arr_chngp))
npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
HSDLp, HSDGp = [], []
for idx,subarr in enumerate(npi_gb):
HSDLp.append( np.amin(subarr) )
HSDGp.append( np.amax(subarr) )
#npi_gb.clear()
arr_concat = np.empty((100,100))
# Last Traded Date of the Week
i = df_daily[['yearWeek', 'ts']].to_numpy()
j = npi.group_by(i[:, 0]).split(i[:, 1])
lastTrdDoW = []
for idx,subarr in enumerate(j):
lastTrdDoW.append( subarr[-1] )
i = np.empty((100,100))
#j.clear()
# Times inreased
TI = np.where(arr_close > arr_prevclose, 1, 0)
# Below npi_gb_yearWeekTI is used in volumes section
arr_concat = np.column_stack((arr_yearWeek, TI))
npi_gb_yearWeekTI = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
tempArr, TI = npi.group_by(arr_yearWeek).sum(TI)
# Volume ( dependent on above section value t_group , thats the reason to move from top to here)
arr_concat = np.column_stack((arr_yearWeek, arr_volumes))
npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
vmaxs, vavgs, volAvgWOhv, HVdAV, CPveoHVD, lastDVotWk, lastDVdAV = [], [], [], [], [], [], []
for idx,subarr in enumerate(npi_gb):
vavgs.append( np.mean(subarr) )
ldvotWk = subarr[-1]
lastDVotWk.append(ldvotWk)
#print(idx, 'O - ',subarr, np.argmax(subarr), ', average : ',np.mean(subarr))
ixDel = np.argmax(subarr)
hV = subarr[ixDel]
vmaxs.append( hV )
if(len(subarr)>1):
subarr = np.delete(subarr, ixDel)
vawoHV = np.mean(subarr)
else:
vawoHV = np.mean(subarr)
volAvgWOhv.append( vawoHV )
HVdAV.append(hV / vawoHV)
CPveoHVD.append( npi_gb_yearWeekTI[idx][ixDel] )
lastDVdAV.append(ldvotWk / vawoHV)
#npi_gb.clear()
arr_concat = np.empty((100,100))
# Preparing the dataframe
# yearWeek and occurances
#yearWeek, daysTraded = np.unique(a[:,0], return_counts=True)
yearWeek = yearWeek.astype(int)
HSDL = np.round(HSDL,2)
HSDG = np.round(HSDG,2)
HSDLp = np.round(HSDLp,2)
HSDGp = np.round(HSDGp,2)
first = np.round(first,2)
last = np.round(last,2)
wChng = np.round(wChng,2)
wChngp = np.round(wChngp,2)
vavgs = np.array(vavgs).astype(int)
volAvgWOhv = np.array(volAvgWOhv).astype(int)
HVdAV = np.round(HVdAV,2)
dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh':vmaxs, 'volAvg':vavgs, 'daysTraded':daysTraded
,'HSDL':HSDL, 'HSDG':HSDG, 'HSDLp':HSDLp, 'HSDGp':HSDGp, 'first':first, 'last':last, 'wChng':wChng, 'wChngp':wChngp
,'lastTrdDoW':lastTrdDoW, 'TI':TI, 'volAvgWOhv':volAvgWOhv, 'HVdAV':HVdAV, 'CPveoHVD':CPveoHVD
,'lastDVotWk':lastDVotWk, 'lastDVdAV':lastDVdAV}
df_weekly = pd.DataFrame(data=dict_temp)
df_weekly['sc_code'] = ticker
cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
, 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
, 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
df_weekly = df_weekly[cols].copy()
# df_weekly_all will be 0, when its a new company or its a FTA(First Time Analysis)
if df_weekly_all.shape[0] == 0:
df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))
# Removing all yearWeek in df_weekly2 from df_weekly
a = set(df_weekly_all['yearWeek'])
b = set(df_weekly['yearWeek'])
c = list(a.difference(b))
#print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)) )
df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()
# Append the latest week data to df_weekly
df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
#print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))
return df_weekly_all
Input data
ts = ['2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00']
sc_code = ['500002','500002','500002','500002','500002','500002','500002','500002','500002','500002']
high = [1326.6, 208.45, 732.15, 14.87, 1979.0, 57.8, 11.55, 1.68, 8.1, 139.4]
low = [1306.35, 204.0, 717.05, 13.41, 1937.65, 54.65, 11.2, 1.52, 7.75, 135.65]
close = [1313.55, 206.65, 723.05, 13.53, 1955.25, 56.0, 11.21, 1.68, 8.1, 136.85]
prevclose = [1319.85, 202.95, 718.95, 14.29, 1967.3, 54.65, 11.22, 1.6, 7.75, 135.05]
volumes = [7785, 6150, 21641, 46296, 707019, 40089, 25300, 5920, 500, 235355]
yearWeek = [201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913]
chng = [-6.29, 3.70, 4.09, -0.75, -12.04, 1.35, -0.09, 0.079, 0.34, 1.79]
chngp = [-0.48, 1.82, 0.57, -5.32, -0.61, 2.47, -0.09, 5.0, 4.52, 1.33]
dict_temp = {'ts':ts, 'sc_code':sc_code, 'high':high, 'low':low, 'close':close, 'prevclose':prevclose, 'volumes':volumes, 'yearWeek':yearWeek, 'chng':chng, 'chngp':chngp}
df_weekly = pd.DataFrame(data=dict_temp)
Adding line-profiler details,
('getcwd : ', '/home/bobby_dreamer')
Timer unit: 1e-06 s
Total time: 0.043637 s
File: BTD-Analysis1V3.py
Function: weekly_trend_analysis at line 36
Line # Hits Time Per Hit % Time Line Contents
==============================================================
36 def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
37
38 1 3.0 3.0 0.0 if exchange == 'BSE':
39 1 963.0 963.0 2.2 ticker = df_daily.iloc[0]['sc_code']
40 else:
41 ticker = df_daily.iloc[0]['symbol']
42
95 # Last Traded Date of the Week
96 1 3111.0 3111.0 7.1 i = df_daily[['yearWeek', 'ts']].to_numpy()
97 1 128.0 128.0 0.3 j = npi.group_by(i[:, 0]).split(i[:, 1])
98
160
161 1 3.0 3.0 0.0 dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh':vmaxs, 'volAvg':vavgs, 'daysTraded':daysTraded
162 1 2.0 2.0 0.0 ,'HSDL':HSDL, 'HSDG':HSDG, 'HSDLp':HSDLp, 'HSDGp':HSDGp, 'first':first, 'last':last, 'wChng':wChng, 'wChngp':wChngp
163 1 2.0 2.0 0.0 ,'lastTrdDoW':lastTrdDoW, 'TI':TI, 'volAvgWOhv':volAvgWOhv, 'HVdAV':HVdAV, 'CPveoHVD':CPveoHVD
164 1 2.0 2.0 0.0 ,'lastDVotWk':lastDVotWk, 'lastDVdAV':lastDVdAV}
165 1 3677.0 3677.0 8.4 df_weekly = pd.DataFrame(data=dict_temp)
166
167 1 1102.0 1102.0 2.5 df_weekly['sc_code'] = ticker
168
169 1 3.0 3.0 0.0 cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
170 1 1.0 1.0 0.0 , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
171 1 2.0 2.0 0.0 , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
172
173 1 2816.0 2816.0 6.5 df_weekly = df_weekly[cols].copy()
174
175 # df_weekly_all will be 0, when its a new company or its a FTA(First Time Analysis)
176 1 13.0 13.0 0.0 if df_weekly_all.shape[0] == 0:
177 1 20473.0 20473.0 46.9 df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))
178
179 # Removing all yearWeek in df_weekly2 from df_weekly
180 1 321.0 321.0 0.7 a = set(df_weekly_all['yearWeek'])
181 1 190.0 190.0 0.4 b = set(df_weekly['yearWeek'])
182 1 5.0 5.0 0.0 c = list(a.difference(b))
183 #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)) )
184 1 1538.0 1538.0 3.5 df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()
185
186 # Append the latest week data to df_weekly
187 1 6998.0 6998.0 16.0 df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
188 #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))
189
190 1 2.0 2.0 0.0 return df_weekly_all
After reviewing the above profile, made changes to code which consumed more time, basically added more numpy code removed pandas in the function. Below code when run with whole data took only Wall time: 7min 47s.
Encountered some numpy errors like below, handled via writing intermediate files. I am using windows machine and intermediate files were < 3MB. Not sure if there were any limitations.
MemoryError: Unable to allocate array with shape (82912, 22) and data type <U32
Line # Hits Time Per Hit % Time Line Contents
==============================================================
38 def weekly_trend_analysis_np(exchange, np_weekly_all, df_daily):
39
40 1 4.0 4.0 0.0 if exchange == 'BSE':
43 1 152.0 152.0 1.2 ticker = df_daily['sc_code'].to_numpy()[0]
44 else:
47 ticker = df_daily['symbol'].to_numpy()[0]
48
101 # Last Traded Date of the Week
102 1 33.0 33.0 0.3 arr_concat = np.column_stack((arr_yearWeek, arr_ts))
103 1 341.0 341.0 2.6 npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
104
152 1 5.0 5.0 0.0 yearWeek = yearWeek.astype(int)
153 1 59.0 59.0 0.5 HSDL = np.round(HSDL,2)
154 1 26.0 26.0 0.2 HSDG = np.round(HSDG,2)
155 1 23.0 23.0 0.2 HSDLp = np.round(HSDLp,2)
156 1 23.0 23.0 0.2 HSDGp = np.round(HSDGp,2)
157
158 1 23.0 23.0 0.2 first = np.round(first,2)
159 1 23.0 23.0 0.2 last = np.round(last,2)
160 1 23.0 23.0 0.2 wChng = np.round(wChng,2)
161 1 23.0 23.0 0.2 wChngp = np.round(wChngp,2)
162
163 1 12.0 12.0 0.1 vavgs = np.array(vavgs).astype(int)
164 1 16.0 16.0 0.1 volAvgWOhv = np.array(volAvgWOhv).astype(int)
165 1 24.0 24.0 0.2 HVdAV = np.round(HVdAV,2)
166
167 1 16.0 16.0 0.1 ticker = np.full(yearWeek.shape[0], ticker)
168 1 2.0 2.0 0.0 np_weekly = np.column_stack((ticker, yearWeek, lastTrdDoW, daysTraded, cmins, cmaxs, vavgs, vmaxs, HSDL
169 1 2.0 2.0 0.0 , HSDG, HSDLp, HSDGp, first, last, wChng, wChngp, TI, volAvgWOhv, HVdAV
170 1 546.0 546.0 4.2 , CPveoHVD, lastDVotWk, lastDVdAV))
171
173 1 2.0 2.0 0.0 if len(np_weekly_all) > 0:
175 1 2.0 2.0 0.0 a = np_weekly_all[:,1]
176 1 1.0 1.0 0.0 b = np_weekly[:,1]
177 1 205.0 205.0 1.6 tf_1 = np.isin(a, b, invert=True)
179 1 13.0 13.0 0.1 t_result = list(compress(range(len(tf_1)), tf_1))
181 1 13.0 13.0 0.1 np_weekly_all = np_weekly_all[t_result]
182 1 40.0 40.0 0.3 np_weekly_all = np.vstack((np_weekly_all, np_weekly))
183 else:
184 np_weekly_all = []
185 np_weekly_all = np.vstack((np_weekly))
186
187 1 2.0 2.0 0.0 return np_weekly_all
I would glad to hear your suggestions and thanks for pointing to profiler, i didn't know about that.