I'm trying to train a model to predict departure delay based on airline, day of the month, Dest and Origin. I tried several approaches but the accuracy is very low. enter image description here Fist I used the delay labels directly varying from -20 to +20 min, I tried making it easier by setting intervals so : for delays in [0 5[ => 0 [5 10] => 1 ..etc
but still the accuracy is bad and I tried several approaches ;
Changing the layers
Not normalizing the features
removing and adding new features
But still I can't find something that works
################### Load the datasetdf= dataset[['UniqueCarrier','DayofMonth','DepDelay','Dest','Origin']]
df.tail()
df = df.dropna()
df = df[(df['DepDelay'] <= 20) & (df['DepDelay'] <= 20)]
############### mask delay values
ask = (df.DepDelay > 0) & (df.DepDelay < 5)
column_name = 'DepDelay'
df.loc[mask, column_name] = 0
mask = (df.DepDelay >= 5) & (df.DepDelay < 10)
column_name = 'DepDelay'
df.loc[mask, column_name] = 1
mask = (df.DepDelay >= 10) & (df.DepDelay < 15)
column_name = 'DepDelay'
df.loc[mask, column_name] = 2
mask = (df.DepDelay >= 15) & (df.DepDelay <= 20)
column_name = 'DepDelay'
df.loc[mask, column_name] = 3
mask = (df.DepDelay >= -5) & (df.DepDelay < 0)
column_name = 'DepDelay'
df.loc[mask, column_name] = -1
mask = (df.DepDelay >= -10) & (df.DepDelay < -5)
column_name = 'DepDelay'
df.loc[mask, column_name] = -2
mask = (df.DepDelay >= -15) & (df.DepDelay < -10)
column_name = 'DepDelay'
df.loc[mask, column_name] = -3
mask = (df.DepDelay >= -20) & (df.DepDelay < -15)
column_name = 'DepDelay'
df.loc[mask, column_name] = -4
############### Splitting labels and features
y= df['DepDelay']
df.drop(columns = ['DepDelay'], inplace = True, axis = 1)
################ replacing character values
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['Dest'] = le.fit_transform(df.Dest.values)
df['Origin'] = le.fit_transform(df.Origin.values)
df['UniqueCarrier'] = le.fit_transform(df.UniqueCarrier.values
########################## normalization
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
# Normalize Training Data
std_scale = preprocessing.StandardScaler().fit(df)
df_norm = std_scale.transform(df)
training_norm_col1 = pd.DataFrame(df_norm, index=df.index,
columns=df.columns)
df.update(training_norm_col1)
print (df.head())
########################## THE model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import numpy
class LossHistory(keras.callbacks.Callback):
def on_train_begin(self, logs={}):
self.losses = []
def on_batch_end(self, batch, logs={}):
self.losses.append(logs.get('loss'))
model = Sequential()
model.add(Dense(64, input_dim=4, activation='relu'))
model.add(Dense(30, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1))
# Compile model
model.compile(loss='mean_squared_error', optimizer='adam', metrics=
['accuracy'])
# Fit the model
history = LossHistory()
model.fit(df, y, validation_split=0.33, epochs=1000,
batch_size=50,verbose=1, callbacks=[history])
print(history.losses)
the accuracy is about : 0.3524 while training. THE DATAFRAME for traning is about 3M rows