I implemented a regression model (one that takes in 3 inputs and adds them to produce the output) in TensorFlow as:
hidden = tf.keras.layers.Dense(units=8, input_shape=[3])
output = tf.keras.layers.Dense(units=1)
model = tf.keras.Sequential([hidden, output])
model.compile(loss='mean_squared_error')
history = model.fit(xtrain, ytrain,
                    epochs=500,
                    shuffle=True,
                    verbose=False)
print("Finished training the model")
And it performs excellently with a training set of just 8 samples and 500 epochs (by default it uses a linear activation and no normalization).
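For concreteness, the training data has this shape (the values below are only illustrative, not my actual numbers), written in the same array form my C/C++ code below uses: 8 rows of 3 inputs, each labelled with the sum of those inputs.

// Illustrative training set of the kind described above: 8 samples of 3 inputs,
// each target being the sum of the three inputs (not my actual values).
double training_inputs[8][3] = {
    {1, 2, 3}, {0, 1, 4}, {2, 2, 2}, {5, 1, 0},
    {3, 3, 1}, {4, 0, 2}, {1, 1, 1}, {2, 5, 3}
};
double training_outputs[8][1] = {
    {6}, {5}, {6}, {6}, {7}, {6}, {3}, {10}
};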
When I try to do the same in a low-level C/C++ model (using only the standard C/C++ libraries), with everything apparently identical, I get NaN for all the weights as well as the output.
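Here lin and dlin are the linear (identity) activation and its derivative; essentially:

// Linear (identity) activation and its derivative used in the code below
// (shown as a sketch; the real definitions are in the full file).
double lin(double x)  { return x; }
double dlin(double x) { return 1.0; }  // derivative of the identity is 1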
/// Forward pass
for (int j = 0; j < numHiddenNodes; j++) {
    double activation = hiddenLayerBias[j];
    //std::cout << "Training Set :" << x << "\n";
    for (int k = 0; k < numInputs; k++) {
        activation += training_inputs[x][k] * hiddenWeights[k][j];
    }
    hiddenLayer[j] = lin(activation);
}
for (int j = 0; j < numOutputs; j++) {
    double activation = outputLayerBias[j];
    for (int k = 0; k < numHiddenNodes; k++) {
        activation += hiddenLayer[k] * outputWeights[k][j];
    }
    outputLayer[j] = lin(activation);
}
//std::cout << "Input:" << training_inputs[x][0] << " Output:" << outputLayer[0] << " Expected Output: " << training_outputs[x][0] << "\n";
MSE += (1.0 / numOutputs) * pow(training_outputs[x][0] - outputLayer[0], 2);
/// Backprop
/// For V
double deltaOutput[numOutputs];
for (int j = 0; j < numOutputs; j++) {
    double errorOutput = (training_outputs[x][j] - outputLayer[j]);
    deltaOutput[j] = errorOutput * dlin(outputLayer[j]);
}
/// For W
double deltaHidden[numHiddenNodes];
for (int j = 0; j < numHiddenNodes; j++) {
    double errorHidden = 0.0;
    for (int k = 0; k < numOutputs; k++) {
        errorHidden += deltaOutput[k] * outputWeights[j][k];
    }
    deltaHidden[j] = errorHidden * dlin(hiddenLayer[j]);
}
/// Update
/// For V and b
for (int j = 0; j < numOutputs; j++) {
    // b
    outputLayerBias[j] += deltaOutput[j] * lr;
    for (int k = 0; k < numHiddenNodes; k++) {
        outputWeights[k][j] += hiddenLayer[k] * deltaOutput[j] * lr;
    }
}
The full file can be accessed here.
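The hidden-layer weights and bias are updated in the same way; a sketch of that step (see the full file for the exact code):

/// For W and b (sketch of the corresponding hidden-layer update)
for (int j = 0; j < numHiddenNodes; j++) {
    // b
    hiddenLayerBias[j] += deltaHidden[j] * lr;
    for (int k = 0; k < numInputs; k++) {
        hiddenWeights[k][j] += training_inputs[x][k] * deltaHidden[j] * lr;
    }
}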
I can't figure out what accounts for the difference in behaviour. By the way, my model works well with other activation functions such as tanh, but I switched to linear for experimentation.