I'm wondering if I am doing this the correct way. I am new to Python and tried to figure this out as best I could, but now that I've almost completed my project, part of it is painfully slow.
I have pulled down daily OHLC bars, filtered through them to find the gappers, and then created a class which goes over the minute data for those daily gappers and returns important information which I use later for my backtests, e.g. whether we've hit the 200 EMA in the pre-market.
Here's what my code looks like:
# get the dates for our gaps
import os.path
import glob
import numpy as np
from pathlib import Path
folder = "daily_bars_filtered/*.csv"
df_gapper_list = []
df_intraday_analysis = []

# Loop through the daily gapper files (only the first 13 matches are processed).
for fname in glob.glob(folder)[:13]:
    ticker = Path(fname).stem
    df = pd.read_csv(fname)
    df['ticker'] = ticker
    df_gapper_list.append(df)
    print(f'downloading {ticker}')

    # Load the intraday (minute) bars file that belongs to this ticker.
    file_ = f'intraday_bars_gapped_new/{ticker}.csv'
    df_minute_bars = pd.read_csv(file_)

    # For each of this ticker's daily gappers, analyse the minute data and
    # collect the resulting frame.
    # to_dict('records') yields one plain dict per row, so the row['col']
    # lookups stay the same while avoiding the per-row Series that
    # iterrows() constructs.
    for row in df.to_dict('records'):
        session_data = SESSION_DATA(
            # pass in the minute data
            df_minute_bars,
            # pass in the daily data and ticker
            row['open'],
            row['high'],
            row['low'],
            row['close'],
            row['date'],
            ticker,
            row['previous close'],
            row['volume'],
        ).intraday_data
        df_intraday_analysis.append(session_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when no daily rows were analysed.
if df_intraday_analysis:
    final_df = pd.concat(df_intraday_analysis, ignore_index=True)
else:
    final_df = pd.DataFrame()

display(final_df)
print(f'length of final_df is {len(final_df)}')
final_df.to_csv('mikeys-spreadsheet2222.csv', index=False)
And here's what my class looks like:
import pandas as pd
from datetime import datetime, time
from IPython.display import display
import math
class SESSION_DATA:
    """Build a one-row summary for a single ticker on a single date.

    The constructor stores the daily values it is given, selects the rows of
    ``minute_data`` whose ``'date'`` column equals ``date``, converts that
    slice's ``'time'`` column to ``datetime.time`` values, and stores a
    one-row ``pandas.DataFrame`` in ``self.intraday_data``.

    Args:
        minute_data: DataFrame of minute bars; must have ``'date'`` and
            ``'time'`` columns.
        open: Daily open.
        high: Daily high.
        low: Daily low.
        close: Daily close.
        date: Value compared against ``minute_data['date']`` to pick the day.
        ticker: Ticker symbol written to the output row.
        previous_close: Previous day's close.
        volume: Daily volume.
    """

    def __init__(self, minute_data, open, high, low, close, date, ticker,
                 previous_close, volume):
        self.minute_data = minute_data
        self.date = date
        self.ticker = ticker
        self.open = open
        self.high = high
        self.low = low
        self.close = close
        self.previous_close = previous_close
        self.volume = volume

        # Take an explicit copy of the day's rows so that adding the 'time'
        # column below writes to an independent frame, not to a slice of
        # minute_data (which triggers SettingWithCopyWarning).
        df_current_day = minute_data[minute_data['date'] == date].copy()

        # Parse only this day's timestamps. Parsing the 'time' column of the
        # full minute frame on every instantiation repeats the same work for
        # every daily row and is discarded for all rows but this day's.
        df_current_day['time'] = pd.to_datetime(df_current_day['time']).dt.time

        self.after_hours_high = GET_TIME_PERIOD_DATA(
            'after_hours', df_current_day).high
        self.after_hours_runner = bigger_smaller(
            self.after_hours_high, self.previous_close)

        # Build the pre-market helper once and read both attributes from it.
        # NOTE(review): assumes GET_TIME_PERIOD_DATA has no side effects, so
        # one construction is equivalent to two - confirm.
        pre_market = GET_TIME_PERIOD_DATA('pre_market', df_current_day)
        self.pre_market_high = pre_market.high
        self.pre_market_high_time = pre_market.high_time

        # NOTE(review): self.early_pre_market_runner is not assigned anywhere
        # in the code visible here; unless it is set elsewhere, this line
        # raises AttributeError - confirm.
        self.new_gapper = new_gapper(
            self.after_hours_runner, self.early_pre_market_runner)
        self.spike = abs(self.high - self.open)

        # index=[0] is required because every value is a scalar; the result
        # is a single-row frame.
        self.intraday_data = pd.DataFrame({
            'date': self.date,
            'ticker': self.ticker,
            'open': self.open,
            'high': self.high,
            'low': self.low,
            'close': self.close,
            'prev close': self.previous_close,
            'volume': self.volume,
            'PM hi': self.pre_market_high,
            'PM hi time': self.pre_market_high_time,
            'PM 200 ema hit': HIT_200_EMA('pre_market', df_current_day).hit_200_ema,
            'New gapper': self.new_gapper,
            'Spike': self.spike,
            'Pop over 10%': pop_over_10(self.spike),
        }, index=[0])
Is there a better way of achieving what I am doing, maybe without the use of iterrows or using something like numpy?