I am training LSTM neural networks that are supposed to predict quintiles of stock price distributions. Since I want to train the model not just on one stock but on a sample of 500, I wrote the training loop below, which fits the model to each stock, saves the model parameters, and then loads them again when training on the next stock. My question is whether I can write the code as a for loop like this, or whether I could instead use a single dataset containing all 500 stocks, with the data simply concatenated along axis 0 (I sketch what I mean by that at the end of the post, after the training loop).
The idea is that the model iterates over each stock, the best model is saved by the checkpoint callback, and those saved weights are reloaded again before fitting the next stock.
This is the training loop I would like to use:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler


def compile_and_fit(model_type, model, checkpoint_path, config, stock_data, macro_data,
                    factor_data, patience, batch_size, num_epochs,
                    train_set_ratio, val_set_ratio, Y_name):
    """
    model_type       = 'combined' (two input branches) or 'merged' (single input)
    model            = compiled Keras model
    stock_data       = per-stock target data, indexed by date, with a 'permno' column
    factor_data      = factor data, indexed by date
    macro_data       = macro data, indexed by date
    batch_size       = timesteps per input window (also used as the fit batch size)
    data set ratios  = train_set_ratio, val_set_ratio (e.g. 0.5)
    Y_name           = name of the target column in stock_data
    """
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=patience,
        mode='min')
    cp_callback = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='loss',
        verbose=True,
        save_best_only=True,
        save_freq=batch_size,   # integer save_freq = check/save every `batch_size` batches
        mode='min')

    permno_list = stock_data.permno.unique()
    test_data = pd.DataFrame()

    for counter, p in enumerate(permno_list):
        # First stock: start from the freshly built model.
        # Every following stock: reload the best weights saved so far and create a
        # new checkpoint callback pointing at the same file.
        if counter == 0:
            trained_model = model
        else:
            trained_model = tf.keras.models.load_model(checkpoint_path)
            cp_callback = tf.keras.callbacks.ModelCheckpoint(
                checkpoint_path, monitor='loss', verbose=True,
                save_best_only=True, save_freq=batch_size, mode='min')

        # Chronological train / validation / test split for this stock
        stock_df = stock_data.loc[stock_data.permno == p]
        stock_data_length = len(stock_df)
        train_end = int(stock_data_length * train_set_ratio)
        val_end = int(stock_data_length * (train_set_ratio + val_set_ratio))
        train_data_stocks = stock_df.iloc[:train_end]
        val_data_stocks = stock_df.iloc[train_end:val_end]
        test_data_stocks = stock_df.iloc[val_end:]
        test_data = pd.concat([test_data, test_data_stocks], axis=0)

        train_date_index = train_data_stocks.index.values.tolist()
        val_date_index = val_data_stocks.index.values.tolist()

        # Align factor and macro data with the stock's dates and scale the macro
        # variables column-wise to [-1, 1]
        train_data_factors = factor_data.loc[factor_data.index.isin(train_date_index)]
        train_data_macro = macro_data.loc[macro_data.index.isin(train_date_index)]
        train_data_macro_norm = train_data_macro.copy(deep=True)
        for c in train_data_macro_norm.columns:
            train_data_macro_norm[c] = MinMaxScaler(feature_range=(-1, 1)).fit_transform(
                pd.DataFrame(train_data_macro_norm[c]))
        train_data_merged = pd.concat([train_data_factors, train_data_macro_norm], axis=1)

        val_data_factors = factor_data.loc[factor_data.index.isin(val_date_index)]
        val_data_macro = macro_data.loc[macro_data.index.isin(val_date_index)]
        val_data_macro_norm = val_data_macro.copy(deep=True)
        for c in val_data_macro_norm.columns:
            val_data_macro_norm[c] = MinMaxScaler(feature_range=(-1, 1)).fit_transform(
                pd.DataFrame(val_data_macro_norm[c]))
        val_data_merged = pd.concat([val_data_factors, val_data_macro_norm], axis=1)

        if model_type == 'combined':
            # Rolling windows of length `batch_size`; the target is the value at the
            # first timestep after each window.
            x_train_factors, x_train_macro, y_train = [], [], []
            for i in range(batch_size, len(train_data_factors)):
                x_train_factors.append(train_data_factors.values[i - batch_size:i, :])
                x_train_macro.append(train_data_macro_norm.values[i - batch_size:i, :])
                y_train.append(train_data_stocks[Y_name].values[i])
            x_train_factors, x_train_macro, y_train = (
                np.array(x_train_factors), np.array(x_train_macro), np.array(y_train))

            x_val_factors, x_val_macro, y_val = [], [], []
            for i in range(batch_size, len(val_data_factors)):
                x_val_factors.append(val_data_factors.values[i - batch_size:i, :])
                x_val_macro.append(val_data_macro_norm.values[i - batch_size:i, :])
                y_val.append(val_data_stocks[Y_name].values[i])
            x_val_factors, x_val_macro, y_val = (
                np.array(x_val_factors), np.array(x_val_macro), np.array(y_val))

            # Seed the checkpoint's `best` with the reloaded model's current training
            # loss so a worse epoch on the new stock cannot overwrite the best weights.
            score = trained_model.evaluate([x_train_macro, x_train_factors], y_train,
                                           batch_size=batch_size)
            loss_value = score[0] if isinstance(score, (list, tuple)) else score
            cp_callback.best = loss_value

            trained_model.fit(x=[x_train_macro, x_train_factors], y=y_train,
                              batch_size=batch_size, epochs=num_epochs,
                              validation_data=([x_val_macro, x_val_factors], y_val),
                              callbacks=[early_stopping, cp_callback])

        if model_type == 'merged':
            x_train_merged, y_train = [], []
            for i in range(batch_size, len(train_data_merged)):
                x_train_merged.append(train_data_merged.values[i - batch_size:i, :])
                y_train.append(train_data_stocks[Y_name].values[i])
            x_train_merged, y_train = np.array(x_train_merged), np.array(y_train)

            x_val_merged, y_val = [], []
            for i in range(batch_size, len(val_data_merged)):
                x_val_merged.append(val_data_merged.values[i - batch_size:i, :])
                y_val.append(val_data_stocks[Y_name].values[i])
            x_val_merged, y_val = np.array(x_val_merged), np.array(y_val)

            # Same seeding of the checkpoint's `best` as in the 'combined' branch
            score = trained_model.evaluate(x_train_merged, y_train, batch_size=batch_size)
            loss_value = score[0] if isinstance(score, (list, tuple)) else score
            cp_callback.best = loss_value

            trained_model.fit(x=x_train_merged, y=y_train,
                              batch_size=batch_size, epochs=num_epochs,
                              validation_data=(x_val_merged, y_val),
                              callbacks=[early_stopping, cp_callback])

    return trained_model, test_data
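For completeness, this is roughly what I mean by the alternative of using one dataset with all 500 stocks: I would still build the rolling windows per stock (so that no window ever spans two different stocks) and then stack everything along axis 0 into one training set. A rough sketch for the single-input 'merged' case, using only the factor data and a hypothetical make_windows helper that mirrors the windowing loops above (macro scaling and the test split are left out for brevity):

def make_windows(features_df, targets, window):
    # Hypothetical helper mirroring the loops above: rolling windows of length
    # `window` over the feature rows, target = value right after each window.
    x, y = [], []
    for i in range(window, len(features_df)):
        x.append(features_df.values[i - window:i, :])
        y.append(targets[i])
    return np.array(x), np.array(y)

x_all, y_all = [], []
for p in stock_data.permno.unique():
    stock_df = stock_data.loc[stock_data.permno == p]
    features = factor_data.loc[factor_data.index.isin(stock_df.index)]
    x_p, y_p = make_windows(features, stock_df[Y_name].values, batch_size)
    x_all.append(x_p)
    y_all.append(y_p)

# One big training set: window samples from all stocks stacked along axis 0
x_all = np.concatenate(x_all, axis=0)
y_all = np.concatenate(y_all, axis=0)
model.fit(x_all, y_all, batch_size=batch_size, epochs=num_epochs)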
If someone has an idea whether this works or not, I would be incredibly grateful!
In my testing I could see the MSE decreasing steadily, but when the loop moves on to the next stock the MSE starts at a very high value again.
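In case it helps, this is the kind of sanity check I was planning to add to see whether the reload actually carries the previous weights over: right after load_model, evaluate the reloaded model on the previous stock's training windows (just a sketch, reusing the variable names from the loop above):

# If the warm start works, this loss should be close to the best loss that was
# logged while training on the previous stock.
reloaded = tf.keras.models.load_model(checkpoint_path)
prev_loss = reloaded.evaluate(x_train_merged, y_train, batch_size=batch_size)
print('loss of reloaded model on previous stock:', prev_loss)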