numpy - anyway to improve this further(PandasTook(1h26m), NumpyTakes(38m))

Question

Initially had everything written pandas and for this exploratory exercise i had did a lot of groupby's and while running with the whole data it was ran for 1h 26m. Over the last weekend, i had changed everything from pandas to using numpy, currently it took(Wall time: 38min 27s). I would like to know if it can be further improved

While converting to numpy, i had additionally used numpy_indexed.

Overall what i am doing is calling the below function this loop(i have read in lots of places loops are bad). Dataset has around 657058 rows and there around 5000 tickers.

for idx, ticker in enumerate(ticker_list):
    ...
    df_temp = weekly_trend_analysis(exchange, df_weekly, df_daily)
    ...        
    df_weekly_all = pd.concat([df_weekly_all, df_temp], sort=False)

def weekly_trend_analysis(exchange, df_weekly_all, df_daily):

    if exchange == 'BSE':
        ticker = df_daily.iloc[0]['sc_code']
    else:
        ticker = df_daily.iloc[0]['symbol']

    arr_yearWeek = df_daily['yearWeek'].to_numpy()
    arr_close = df_daily['close'].to_numpy()
    arr_prevclose = df_daily['prevclose'].to_numpy()
    arr_chng = df_daily['chng'].to_numpy()
    arr_chngp = df_daily['chngp'].to_numpy()
    arr_ts = df_daily['ts'].to_numpy()
    arr_volumes = df_daily['volumes'].to_numpy()

    # Close
    arr_concat = np.column_stack((arr_yearWeek, arr_close))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    #a = df_temp[['yearWeek', 'close']].to_numpy()
    yearWeek, daysTraded = np.unique(arr_concat[:,0], return_counts=True)

    cmaxs, cmins = [], []
    first, last, wChng, wChngp = [], [], [], []
    for idx,subarr in enumerate(npi_gb):
        cmaxs.append( np.amax(subarr) )
        cmins.append( np.amin(subarr) )
        first.append(subarr[0])
        last.append(subarr[-1])
        wChng.append( subarr[-1] - subarr[0] )
        wChngp.append( ( (subarr[-1] / subarr[0]) * 100) - 100 )

    #npi_gb.clear()
    arr_concat = np.empty((100,100))

    # Chng
    arr_concat = np.column_stack((arr_yearWeek, arr_chng))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    HSDL, HSDG = [], []
    for idx,subarr in enumerate(npi_gb):
        HSDL.append( np.amin(subarr) )
        HSDG.append( np.amax(subarr) )

    #npi_gb.clear()
    arr_concat = np.empty((100,100))

    # Chngp
    arr_concat = np.column_stack((arr_yearWeek, arr_chngp))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    HSDLp, HSDGp = [], []
    for idx,subarr in enumerate(npi_gb):
        HSDLp.append( np.amin(subarr) )
        HSDGp.append( np.amax(subarr) )

    #npi_gb.clear()
    arr_concat = np.empty((100,100))

    # Last Traded Date of the Week
    i = df_daily[['yearWeek', 'ts']].to_numpy()
    j = npi.group_by(i[:, 0]).split(i[:, 1])

    lastTrdDoW = []
    for idx,subarr in enumerate(j):
        lastTrdDoW.append( subarr[-1] )

    i = np.empty((100,100))
    #j.clear()

    # Times inreased
    TI = np.where(arr_close > arr_prevclose, 1, 0)

    # Below npi_gb_yearWeekTI is used in volumes section
    arr_concat = np.column_stack((arr_yearWeek, TI))
    npi_gb_yearWeekTI = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    tempArr, TI = npi.group_by(arr_yearWeek).sum(TI)

    # Volume ( dependent on above section value t_group , thats the reason to move from top to here)
    arr_concat = np.column_stack((arr_yearWeek, arr_volumes))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    vmaxs, vavgs, volAvgWOhv, HVdAV, CPveoHVD, lastDVotWk, lastDVdAV = [], [], [], [], [], [], []
    for idx,subarr in enumerate(npi_gb):
        vavgs.append( np.mean(subarr) )
        ldvotWk = subarr[-1]
        lastDVotWk.append(ldvotWk)

        #print(idx, 'O - ',subarr, np.argmax(subarr), ', average : ',np.mean(subarr))
        ixDel = np.argmax(subarr)
        hV = subarr[ixDel]
        vmaxs.append( hV )

        if(len(subarr)>1):
            subarr = np.delete(subarr, ixDel)
            vawoHV = np.mean(subarr)
        else:
            vawoHV = np.mean(subarr)
        volAvgWOhv.append( vawoHV )
        HVdAV.append(hV / vawoHV)
        CPveoHVD.append( npi_gb_yearWeekTI[idx][ixDel] )
        lastDVdAV.append(ldvotWk / vawoHV)    

    #npi_gb.clear()
    arr_concat = np.empty((100,100))

    # Preparing the dataframe
    # yearWeek and occurances 
    #yearWeek, daysTraded = np.unique(a[:,0], return_counts=True)
    yearWeek = yearWeek.astype(int)
    HSDL = np.round(HSDL,2)
    HSDG = np.round(HSDG,2)
    HSDLp = np.round(HSDLp,2)
    HSDGp = np.round(HSDGp,2)

    first = np.round(first,2)
    last = np.round(last,2)
    wChng = np.round(wChng,2)
    wChngp = np.round(wChngp,2)

    vavgs = np.array(vavgs).astype(int)
    volAvgWOhv = np.array(volAvgWOhv).astype(int)
    HVdAV = np.round(HVdAV,2)

    dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh':vmaxs, 'volAvg':vavgs, 'daysTraded':daysTraded
                ,'HSDL':HSDL, 'HSDG':HSDG, 'HSDLp':HSDLp, 'HSDGp':HSDGp, 'first':first, 'last':last, 'wChng':wChng, 'wChngp':wChngp
                ,'lastTrdDoW':lastTrdDoW, 'TI':TI, 'volAvgWOhv':volAvgWOhv, 'HVdAV':HVdAV, 'CPveoHVD':CPveoHVD
                ,'lastDVotWk':lastDVotWk, 'lastDVdAV':lastDVdAV}
    df_weekly = pd.DataFrame(data=dict_temp)

    df_weekly['sc_code'] = ticker

    cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
             , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
             , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']

    df_weekly = df_weekly[cols].copy()

    # df_weekly_all will be 0, when its a new company or its a FTA(First Time Analysis)
    if df_weekly_all.shape[0] == 0:
        df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))       

    # Removing all yearWeek in df_weekly2 from df_weekly
    a = set(df_weekly_all['yearWeek'])
    b = set(df_weekly['yearWeek'])
    c = list(a.difference(b))
    #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)) )
    df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()

    # Append the latest week data to df_weekly
    df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
    #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))    

    return df_weekly_all

Input data

ts = ['2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00']
sc_code = ['500002','500002','500002','500002','500002','500002','500002','500002','500002','500002']
high = [1326.6, 208.45, 732.15, 14.87, 1979.0, 57.8, 11.55, 1.68, 8.1, 139.4]
low = [1306.35, 204.0, 717.05, 13.41, 1937.65, 54.65, 11.2, 1.52, 7.75, 135.65]
close = [1313.55, 206.65, 723.05, 13.53, 1955.25, 56.0, 11.21, 1.68, 8.1, 136.85]
prevclose = [1319.85, 202.95, 718.95, 14.29, 1967.3, 54.65, 11.22, 1.6, 7.75, 135.05]
volumes = [7785, 6150, 21641, 46296, 707019, 40089, 25300, 5920, 500, 235355]
yearWeek = [201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913]
chng = [-6.29, 3.70, 4.09, -0.75, -12.04, 1.35, -0.09, 0.079, 0.34, 1.79]
chngp = [-0.48, 1.82, 0.57, -5.32, -0.61, 2.47, -0.09, 5.0, 4.52, 1.33]


dict_temp = {'ts':ts, 'sc_code':sc_code, 'high':high, 'low':low, 'close':close, 'prevclose':prevclose, 'volumes':volumes, 'yearWeek':yearWeek, 'chng':chng, 'chngp':chngp}
df_weekly = pd.DataFrame(data=dict_temp)

Adding line-profiler details,

('getcwd : ', '/home/bobby_dreamer')
Timer unit: 1e-06 s

Total time: 0.043637 s
File: BTD-Analysis1V3.py
Function: weekly_trend_analysis at line 36

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    36                                           def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
    37                                           
    38         1          3.0      3.0      0.0      if exchange == 'BSE':
    39         1        963.0    963.0      2.2          ticker = df_daily.iloc[0]['sc_code']
    40                                               else:
    41                                                   ticker = df_daily.iloc[0]['symbol']
    42                                           
    95                                               # Last Traded Date of the Week
    96         1       3111.0   3111.0      7.1      i = df_daily[['yearWeek', 'ts']].to_numpy()
    97         1        128.0    128.0      0.3      j = npi.group_by(i[:, 0]).split(i[:, 1])
    98                                           
   160                                           
   161         1          3.0      3.0      0.0      dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh':vmaxs, 'volAvg':vavgs, 'daysTraded':daysTraded
   162         1          2.0      2.0      0.0                  ,'HSDL':HSDL, 'HSDG':HSDG, 'HSDLp':HSDLp, 'HSDGp':HSDGp, 'first':first, 'last':last, 'wChng':wChng, 'wChngp':wChngp
   163         1          2.0      2.0      0.0                  ,'lastTrdDoW':lastTrdDoW, 'TI':TI, 'volAvgWOhv':volAvgWOhv, 'HVdAV':HVdAV, 'CPveoHVD':CPveoHVD
   164         1          2.0      2.0      0.0                  ,'lastDVotWk':lastDVotWk, 'lastDVdAV':lastDVdAV}
   165         1       3677.0   3677.0      8.4      df_weekly = pd.DataFrame(data=dict_temp)
   166                                           
   167         1       1102.0   1102.0      2.5      df_weekly['sc_code'] = ticker
   168                                           
   169         1          3.0      3.0      0.0      cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
   170         1          1.0      1.0      0.0               , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
   171         1          2.0      2.0      0.0               , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
   172                                           
   173         1       2816.0   2816.0      6.5      df_weekly = df_weekly[cols].copy()
   174                                                   
   175                                               # df_weekly_all will be 0, when its a new company or its a FTA(First Time Analysis)
   176         1         13.0     13.0      0.0      if df_weekly_all.shape[0] == 0:
   177         1      20473.0  20473.0     46.9          df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))       
   178                                                   
   179                                               # Removing all yearWeek in df_weekly2 from df_weekly
   180         1        321.0    321.0      0.7      a = set(df_weekly_all['yearWeek'])
   181         1        190.0    190.0      0.4      b = set(df_weekly['yearWeek'])
   182         1          5.0      5.0      0.0      c = list(a.difference(b))
   183                                               #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)) )
   184         1       1538.0   1538.0      3.5      df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()
   185                                           
   186                                               # Append the latest week data to df_weekly
   187         1       6998.0   6998.0     16.0      df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
   188                                               #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))    
   189                                                   
   190         1          2.0      2.0      0.0      return df_weekly_all

After reviewing the above profile, made changes to code which consumed more time, basically added more numpy code removed pandas in the function. Below code when run with whole data took only Wall time: 7min 47s.

Encountered some numpy errors like below, handled via writing intermediate files. I am using windows machine and intermediate files were < 3MB. Not sure if there were any limitations.

MemoryError: Unable to allocate array with shape (82912, 22) and data type <U32

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
    38                                           def weekly_trend_analysis_np(exchange, np_weekly_all, df_daily):
    39                                           
    40         1          4.0      4.0      0.0      if exchange == 'BSE':
    43         1        152.0    152.0      1.2          ticker = df_daily['sc_code'].to_numpy()[0]
    44                                               else:
    47                                                   ticker = df_daily['symbol'].to_numpy()[0]
    48                                           
   101                                               # Last Traded Date of the Week
   102         1         33.0     33.0      0.3      arr_concat = np.column_stack((arr_yearWeek, arr_ts))
   103         1        341.0    341.0      2.6      npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
   104                                           
   152         1          5.0      5.0      0.0      yearWeek = yearWeek.astype(int)
   153         1         59.0     59.0      0.5      HSDL = np.round(HSDL,2)
   154         1         26.0     26.0      0.2      HSDG = np.round(HSDG,2)
   155         1         23.0     23.0      0.2      HSDLp = np.round(HSDLp,2)
   156         1         23.0     23.0      0.2      HSDGp = np.round(HSDGp,2)
   157                                           
   158         1         23.0     23.0      0.2      first = np.round(first,2)
   159         1         23.0     23.0      0.2      last = np.round(last,2)
   160         1         23.0     23.0      0.2      wChng = np.round(wChng,2)
   161         1         23.0     23.0      0.2      wChngp = np.round(wChngp,2)
   162                                           
   163         1         12.0     12.0      0.1      vavgs = np.array(vavgs).astype(int)
   164         1         16.0     16.0      0.1      volAvgWOhv = np.array(volAvgWOhv).astype(int)
   165         1         24.0     24.0      0.2      HVdAV = np.round(HVdAV,2)
   166                                           
   167         1         16.0     16.0      0.1      ticker = np.full(yearWeek.shape[0], ticker)
   168         1          2.0      2.0      0.0      np_weekly = np.column_stack((ticker, yearWeek, lastTrdDoW, daysTraded, cmins, cmaxs, vavgs, vmaxs, HSDL
   169         1          2.0      2.0      0.0                                 , HSDG, HSDLp, HSDGp, first, last, wChng, wChngp, TI, volAvgWOhv, HVdAV
   170         1        546.0    546.0      4.2                                 , CPveoHVD, lastDVotWk, lastDVdAV))
   171                                               
   173         1          2.0      2.0      0.0      if len(np_weekly_all) > 0:
   175         1          2.0      2.0      0.0          a = np_weekly_all[:,1] 
   176         1          1.0      1.0      0.0          b = np_weekly[:,1] 
   177         1        205.0    205.0      1.6          tf_1 = np.isin(a, b, invert=True) 
   179         1         13.0     13.0      0.1          t_result = list(compress(range(len(tf_1)), tf_1)) 
   181         1         13.0     13.0      0.1          np_weekly_all = np_weekly_all[t_result]
   182         1         40.0     40.0      0.3          np_weekly_all = np.vstack((np_weekly_all, np_weekly))    
   183                                               else:
   184                                                   np_weekly_all = []
   185                                                   np_weekly_all = np.vstack((np_weekly))    
   186                                                   
   187         1          2.0      2.0      0.0      return np_weekly_all

I would glad to hear your suggestions and thanks for pointing to profiler, i didn't know about that.

Please provide usable input and expected output with a description of the process so we don't have to reverse engineer your code — Alain T., Apr 06 '20 at 15:49
Did you profile to see exactly which line(s) were the bottleneck(s)? — Mad Physicist, Apr 06 '20 at 16:13

score 0 · Answer 1 · answered Apr 07 '20 at 20:03

0

Edited and posted the line profiler code above. I have updated the code from pandas to numpy which had reduced the times considerable from inital 1h 26min to now 7mins.

answered Apr 07 '20 at 20:03

bobby.dreamer

366
4
19

numpy - anyway to improve this further(PandasTook(1h26m), NumpyTakes(38m))

1 Answers1