How to feed a network with multidimensional arrays in TensorFlow
Here is a good example of how to read a .csv (a 2D array: one time row per ground truth (GT) reference), balance and SMOTE the GT data, and have TF process and train it as:
- Multidimensional, a 3D array (with time windows): N previous time rows per GT reference. Explained HERE (see the windowing sketch below).
- Monodimensional, a 2D array: one time row per GT reference. Not explained here.
A very good, simple example: https://github.com/Leci37/stocks-prediction-Machine-learning-RealTime-telegram/blob/master/Tutorial/RUN_buy_sell_Tutorial_3W_5min_RT.py
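As a first intuition of what "one GT reference, N previous time rows" means, here is a minimal NumPy sketch (illustrative only, not the repo's df_to_df_multidimension_array_2D()) that turns a 2D table of time rows into 3D lookback windows:

```python
import numpy as np

# A minimal sketch: turn a 2D table of time rows into 3D lookback
# windows of shape (samples, lookback, features), so each GT reference
# "sees" its N previous time rows.
def make_lookback_windows(arr_2d: np.ndarray, lookback: int) -> np.ndarray:
    windows = [arr_2d[i - lookback:i] for i in range(lookback, len(arr_2d) + 1)]
    return np.stack(windows)  # shape: (rows - lookback + 1, lookback, features)

rows = np.arange(40, dtype=np.float32).reshape(10, 4)  # 10 time rows, 4 features
x3d = make_lookback_windows(rows, lookback=3)
print(x3d.shape)  # (8, 3, 4)
```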
3D data training from .csv data
For TF to fit() multidimensional 3D arrays, code like the following is needed; here train_features is a 3D array.
Full code here: https://github.com/Leci37/stocks-prediction-Machine-learning-RealTime-telegram/blob/master/Model_train_TF_multi_onBalance.py
# Imbalanced data: https://www.tensorflow.org/tutorials/structured_data/imbalanced_data
def train_TF_Multi_dimension_onBalance(multi_data: Data_multidimension, model_h5_name, model_type: _KEYS_DICT.MODEL_TF_DENSE_TYPE_MULTI_DIMENSI):
    # 1.0 LOAD the data in TF format (split, 3D, SMOTE and balance)
    array_aux_np, train_labels, val_labels, test_labels, train_features, val_features, test_features, bool_train_labels = multi_data.get_all_data()

    # TRAIN
    neg, pos = np.bincount(array_aux_np)  # (df[Y_TARGET])
    initial_bias = np.log([pos / neg])

    # 2.0 EARLY STOPPING: create a CustomEarlyStopping to avoid overfitting
    resampled_steps_per_epoch = np.ceil(2.0 * neg / BATCH_SIZE)
    early_stopping = Utils_model_predict.CustomEarlyStopping(patience=8)

    # 3.0 TRAIN: get the model from the list of model objects and train it
    model = multi_data.get_dicts_models_multi_dimension(model_type)
    model_history = model.fit(
        x=train_features, y=train_labels,
        epochs=EPOCHS,
        steps_per_epoch=resampled_steps_per_epoch,
        callbacks=[early_stopping],  # callbacks=[early_stopping, early_stopping_board],
        validation_data=(val_features, val_labels), verbose=0)

    # 3.1 Save the model to a .h5 file for reuse
    model.save(MODEL_FOLDER_TF_MULTI + model_h5_name)
    print(" Save model Type MULTI TF: " + model_type.value + " Path: ", MODEL_FOLDER_TF_MULTI + model_h5_name)

    # 4.0 EVALUATE the model with test_features; this split was held out and the .h5 model has never seen it
    predit_test = model.predict(test_features).reshape(-1,)
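The model itself comes from multi_data.get_dicts_models_multi_dimension(model_type), whose architecture is not shown here. As a hedged sketch of what a TF model that fit()s these 3D features could look like (shapes such as LOOKBACK=10 and 12 features are assumptions, and the output bias is seeded from log(pos/neg) as in the linked TF imbalanced-data tutorial):

```python
import numpy as np
import tensorflow as tf

# A minimal sketch, NOT the repo's get_dicts_models_multi_dimension():
# a Keras model whose input matches 3D features (samples, LOOKBACK,
# n_features), with the output bias initialized from log(pos/neg).
LOOKBACK, N_FEATURES = 10, 12        # assumed shapes
initial_bias = np.log([1 / 9])       # e.g. ~10% positives

model = tf.keras.Sequential([
    tf.keras.Input(shape=(LOOKBACK, N_FEATURES)),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(1, activation="sigmoid",
                          bias_initializer=tf.keras.initializers.Constant(initial_bias)),
])
model.compile(optimizer="adam", loss="binary_crossentropy")

x = np.random.rand(64, LOOKBACK, N_FEATURES).astype("float32")
y = (np.random.rand(64, 1) < 0.1).astype("float32")
model.fit(x, y, epochs=1, verbose=0)  # fit() accepts the 3D array directly
```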
To get the data into the correct 3D array format, the following steps are required (the example starts from imbalanced GT data and applies the balance and SMOTE corrections):
Full code here: https://github.com/Leci37/stocks-prediction-Machine-learning-RealTime-telegram/blob/master/Data_multidimension.py
1.0 Get a 2D array from the .csv, with one GT column, and create the MULTIDIMENSION view.
2.0 SCALER: scale the data and save a .scal file (it will be used to scale the data of future predictions); see the scaler sketch after this list.
2.1 Put the real ground truth Y_TARGET into a copy of the scaled dataset.
3.0 SPLIT: split into three sets: train, validation and test.
4.0 SMOTE the train_df to balance the data: since there are few positive inputs, "neighbors" of the positive inputs have to be generated. Only on the train_df.
5 PREPARE the data to be fed into TF with the correct (3D array) dimensions.
5.1 Insert the Y_TARGET labels into the 2D array required by TF.
6 DISPLAY the df format before handing it to TF.
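Step 2.0 is handled by Utils_model_predict.scaler_min_max_array() in the repo; the exact persistence mechanism of the .scal file is not shown, so this is an assumed sklearn equivalent: fit a MinMaxScaler, persist it, and reload it so future predictions are scaled identically (paths and shapes are illustrative):

```python
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler

# A minimal sketch of step 2.0 (assumed equivalent of
# Utils_model_predict.scaler_min_max_array + the .scal file).
features = np.random.rand(100, 90)          # 2D window features

scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(features)
joblib.dump(scaler, "MY_STOCK.scal")        # filename is illustrative

# Later, at prediction time, the SAME scaling must be reapplied:
scaler = joblib.load("MY_STOCK.scal")
new_rows_scaled = scaler.transform(np.random.rand(5, 90))
```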
def load_split_data_multidimension(self):
    df = Utils_model_predict.load_and_clean_DF_Train_from_csv(self.path_CSV, self.op_buy_sell, self.columns_selection)
    # SMOTE and Tomek links
    # SMOTE oversampling can generate noisy samples since it creates synthetic data. To mitigate this, after SMOTE we can use an undersampling technique to clean up; we use Tomek links in this example.
    # Utils_plotter.plot_2d_space(df.drop(columns=[Y_TARGET]).iloc[:,4:5] , df[Y_TARGET], path = "SMOTE_antes.png")
    array_aux_np = df[Y_TARGET]  # TODO: before or after the balancing?
    self.array_aux_np = array_aux_np

    print("1.0 ADD MULTIDIMENSION: get a 2D array with BACHT_SIZE_LOOKBACK 'backward glances'.")
    # Values go from (10000 rows, 10 columns) to (10000 rows, (10 - 1[ground truth]) * 10 lookback columns); for the moment it does not go to a 3D array, it remains 2D.
    # df.shape: (1000, 10) to (1000, 90)
    arr_mul_labels, arr_mul_features = Utils_model_predict.df_to_df_multidimension_array_2D(df.reset_index(drop=True), BACHT_SIZE_LOOKBACK=self.BACHT_SIZE_LOOKBACK)
    shape_imput_3d = (-1, self.BACHT_SIZE_LOOKBACK, len(df.columns) - 1)  # e.g. (-1, 10, 12)

    print("1.1 Validate the structure of the data (this check can be improved).")
    arr_vali = arr_mul_features.reshape(shape_imput_3d)  # e.g. (5077, 10, 12)
    for i in range(1, arr_vali.shape[0], self.BACHT_SIZE_LOOKBACK * 3):
        # The first column must hold timestamps; reject windows with out-of-range dates
        list_fails_dates = [x for x in arr_vali[i][:, 0] if not (2018 <= datetime.fromtimestamp(x).year <= 2024)]
        if list_fails_dates:
            Logger.logr.error("The dates of the new 2D array do not appear in the first column.")
            raise ValueError("The dates of the new 2D array do not appear in the first column.")

    print("2.0 SCALER: scale the data and save a .scal file (it will be used to scale the data of future predictions).")
    # Do I have to scale now, or can I wait until after the split?
    # You can scale between the values _KEYS_DICT.MIN_SCALER and _KEYS_DICT.MAX_SCALER
    # "... that you learn for your scaling so that doing scaling before or after may give you the same results (but this depends on the actual scaling function)." https://datascience.stackexchange.com/questions/71515/should-i-scale-data-before-or-after-balancing-dataset
    # TODO: verify the correct "scale, split and SMOTE" order. What is sure: SMOTE is applied only to train_df
    arr_mul_features = Utils_model_predict.scaler_min_max_array(arr_mul_features, path_to_save=_KEYS_DICT.PATH_SCALERS_FOLDER + self.name_models_stock + ".scal")
    arr_mul_labels = Utils_model_predict.scaler_min_max_array(arr_mul_labels.reshape(-1, 1))

    print("2.1 Put the real ground truth Y_TARGET into a copy of the scaled dataset.")
    df_with_target = pd.DataFrame(arr_mul_features)
    df_with_target[Y_TARGET] = arr_mul_labels.reshape(-1,)

    print("3.0 SPLIT into three sets: train, validation and test.")
    # "you divide your data first and then apply synthetic sampling SMOTE on the training data only" https://datascience.stackexchange.com/questions/15630/train-test-split-after-performing-smote
    # CAUTION: SMOTE generates twice as many rows
    train_df, test_df = train_test_split(df_with_target, test_size=0.18, shuffle=self.will_shuffle)  # Shuffle in a time series? hmmm
    train_df, val_df = train_test_split(train_df, test_size=0.35, shuffle=self.will_shuffle)  # Shuffle in a time series? hmmm
    # Be careful not to touch test_df and val_df
    # Apply SMOTE only to train_df, but first remove Y_TARGET from train_df

    print("3.1 Create 2D arrays from the dfs. Remove Y_TARGET from train_df, because that is what we want to predict and keeping it would be cheating.")
    train_df_x = np.asarray(train_df.drop(columns=[Y_TARGET]))
    # In train_df_y we drop everything except Y_TARGET
    train_df_y = np.asarray(train_df[Y_TARGET])

    print("4.0 SMOTE train_df to balance the data: since there are few positive inputs, 'neighbors' of the positive inputs have to be generated. Only on train_df.")
    # Now we can SMOTE only train_df. SMOTE works on 2D data; it is not possible with 3D.
    X_smt, y_smt = Utils_model_predict.prepare_to_split_SMOTETomek_01(train_df_x, train_df_y)

    print("4.1 Put the real ground truth Y_TARGET into a copy of the scaled dataset.")
    train_cleaned_df_target = pd.DataFrame(X_smt)
    train_cleaned_df_target[Y_TARGET] = y_smt.reshape(-1,)
    # SMOTE leaves the positives very close together
    train_cleaned_df_target = shuffle(train_cleaned_df_target)

    print("5 PREPARE the data to be fed into TF with the correct dimensions.")
    print("5.1 Pass the Y_TARGET labels to the 2D array required by TF.")
    train_labels = np.asarray(train_cleaned_df_target[Y_TARGET]).astype('float32').reshape((-1, 1))  # ensure a 2D column vector
    bool_train_labels = (train_labels != 0).reshape((-1))
    val_labels = np.asarray(val_df[Y_TARGET]).astype('float32').reshape((-1, 1))  # ensure a 2D column vector
    test_labels = np.asarray(test_df[Y_TARGET]).astype('float32').reshape((-1, 1))  # ensure a 2D column vector

    print("5.2 All the window arrays that were kept in 2D format (to get through the SCALER and SMOTE steps)")
    # must be reshaped to 3D for TF, using the shape stored in shape_imput_3d
    train_features = np.array(train_cleaned_df_target.drop(columns=[Y_TARGET])).reshape(shape_imput_3d)
    test_features = np.array(test_df.drop(columns=[Y_TARGET])).reshape(shape_imput_3d)
    val_features = np.array(val_df.drop(columns=[Y_TARGET])).reshape(shape_imput_3d)

    print("6 DISPLAY the df format before handing it to TF.")
    Utils_model_predict.log_shapes_trains_val_data(test_features, test_labels, train_features, train_labels, val_features, val_labels)

    self.imput_shape = (train_features.shape[1], train_features.shape[2])
    self.train_labels = train_labels
    self.val_labels = val_labels
    self.test_labels = test_labels
    self.train_features = train_features
    self.val_features = val_features
    self.test_features = test_features
    self.bool_train_labels = bool_train_labels

def get_all_data(self):
    return self.array_aux_np, self.train_labels, self.val_labels, self.test_labels, self.train_features, self.val_features, self.test_features, self.bool_train_labels
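prepare_to_split_SMOTETomek_01() wraps the SMOTE + Tomek-links resampling described in the comments above; a minimal sketch of the underlying imblearn call it presumably uses (synthetic data, for illustration only):

```python
import numpy as np
from imblearn.combine import SMOTETomek

# A minimal sketch of SMOTE + Tomek-links resampling with imblearn,
# the technique prepare_to_split_SMOTETomek_01() presumably wraps.
# Apply it ONLY to the training split, never to val/test.
np.random.seed(42)
X_train = np.random.rand(200, 90)                       # 2D features (SMOTE needs 2D)
y_train = np.random.choice([0, 1], 200, p=[0.9, 0.1])   # imbalanced labels

smt = SMOTETomek(random_state=42)
X_res, y_res = smt.fit_resample(X_train, y_train)
print(np.bincount(y_train), "->", np.bincount(y_res))   # classes now roughly balanced
```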