Neural Network only works sometimes

Question

So I finally coded a neural network (which I been wanting to do for ages) for the first time, with the help of the series from the youtube channel "Coding Train", with the "only" difference being I coded it in C instead of JS.

I tried to simulate an XOR: The structure is two input nodes, two hidden nodes, and one output node. After training, I found out it doesn't work properly.

Already seen this article and several others but they didn't help me: XOR Neural Network sometimes outputs 0.5

This is my training data:

Training data:
IN | OUT
00 | 0
01 | 1
10 | 1
11 | 0

I trained it for multiple sessions, training each session more than 10000 times with a learning rate ranging from 0.5 to 0.01 and expected their corresponding results. In the below table I listed the most common outputs, no matter what learning range I pick, after different training sessions.

One training session = many trainings

Actual output after training (OUTn is the nth training session):
IN | OUT1 | OUT2 | OUT3 | OUT4 | OUT5
00 | 0.01 | 0.01 | 0.01 | 0.66 | 0.01
01 | 0.99 | 0.99 | 0.50 | 0.66 | 0.66
10 | 0.99 | 0.50 | 0.99 | 0.66 | 0.66
11 | 0.01 | 0.50 | 0.50 | 0.66 | 0.66

Most of the time it outputs something pretty weird. And after several hours of error searching etc. I still couldn't figure out where the error is. Maybe someone reading this finds one?

Following is the code.

I defined a GETRANDOM and my network via a struct so that I can easily pass, modify, and return it:

#define GETRANDOM   ( (double)rand() / RAND_MAX * 2.0 - 1.0 )   // random number between -1 and 1

// network structure
struct sNetwork {
    // node count
    int input_nodes;
    int hidden_nodes;
    int output_nodes;
    // values
    double* input_values;
    double* hidden_values;
    double* output_values;
    double* expected_values;
    // error
    double* hidden_error;
    double* output_error;
    // bias
    double* bias_h;
    double* bias_o;
    // weights
    double** weights_ih;
    double** weights_ho;
};
typedef struct sNetwork tNetwork;

And for that I also wrote a setup-function:

tNetwork* setup_network(tNetwork* tNet)
{
    // general error check
    if(tNet == NULL)
    {
        return NULL;
    }
    if((*tNet).input_nodes == 0 || (*tNet).hidden_nodes == 0 || (*tNet).output_nodes == 0)
    {
        return NULL;
    }

    // based on the defined size, set up the weights

    // set up the input to hidden weights
    (*tNet).weights_ih = (double**)malloc((*tNet).input_nodes * sizeof(double*));
    for(int i = 0; i < (*tNet).input_nodes; i++)
    {
        (*tNet).weights_ih[i] = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
        for(int j = 0; j < (*tNet).hidden_nodes; j++)
        {
            (*tNet).weights_ih[i][j] = GETRANDOM;
        }
    }

    // set up the hidden to output weights
    (*tNet).weights_ho = (double**)malloc((*tNet).hidden_nodes * sizeof(double*));
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).weights_ho[i] = (double*)malloc((*tNet).output_nodes * sizeof(double));
        for(int j = 0; j < (*tNet).output_nodes; j++)
        {
            (*tNet).weights_ho[i][j] = GETRANDOM;
        }
    }

    // set up the bias

    // set up hidden bias and value
    (*tNet).bias_h = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).bias_h[i] = GETRANDOM;
    }

    // set up the output bias and value

    (*tNet).bias_o = (double*)malloc((*tNet).output_nodes * sizeof(double));
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        (*tNet).bias_o[i] = GETRANDOM;
    }

    // set up the values
    (*tNet).hidden_values = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
    (*tNet).output_values = (double*)malloc((*tNet).output_nodes * sizeof(double));
    (*tNet).input_values = (double*)malloc((*tNet).input_nodes * sizeof(double));
    (*tNet).expected_values = (double*)malloc((*tNet).output_nodes * sizeof(double));

    // set up the error stuff
    (*tNet).hidden_error = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
    (*tNet).output_error = (double*)malloc((*tNet).output_nodes * sizeof(double));

    return tNet;
}

The sigmoid functions:

double sigmoid(double x)
{
    return 1 / (1 + exp(-x));
}

double dsigmoid(double x)
{
    return x * (1 - (x));
}

Then I coded the feed-forward function:

tNetwork* feed_forward(tNetwork* tNet)
{
    // calculate the hidden outputs
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).hidden_values[i] = (*tNet).bias_h[i];  // add bias to weighted sum

        for(int j = 0; j < (*tNet).input_nodes; j++)
        {
            (*tNet).hidden_values[i] += ( (*tNet).input_values[j] * (*tNet).weights_ih[j][i] ); // build the weighted sum
        }

        (*tNet).hidden_values[i] = sigmoid((*tNet).hidden_values[i]);
    }

    // calculate the output
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        (*tNet).output_values[i] = (*tNet).bias_o[i];  // add bias to weighted sum

        for(int j = 0; j < (*tNet).hidden_nodes; j++)
        {
            (*tNet).output_values[i] += ( (*tNet).hidden_values[j] * (*tNet).weights_ho[j][i] ); // build the weighted sum
        }
        (*tNet).output_values[i] = sigmoid((*tNet).output_values[i]);
    }

    return tNet;
}

After that the train function:

tNetwork* train(tNetwork* tNet, double learning_rate)
{
    // first of all feed the network
    tNet = feed_forward(tNet);

    // init the hidden errors
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).hidden_error[i] = 0;
    }

    // calculate the output error
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        (*tNet).output_error[i] = (*tNet).expected_values[i] - (*tNet).output_values[i];
    }

    // calculate the hidden error
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        for(int j = 0; j < (*tNet).output_nodes; j++)
        {
            (*tNet).hidden_error[i] += ( (*tNet).weights_ho[i][j] * (*tNet).output_error[j] );
        }
    }

    // adjust outputs
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        // adjust output bias
        double gradient = learning_rate * (*tNet).output_error[i] * dsigmoid((*tNet).output_values[i]);
        (*tNet).bias_o[i] += gradient;

        for(int j = 0; j < (*tNet).hidden_nodes; j++)
        {
            // adjust hidden->output weights
            (*tNet).weights_ho[j][i] += gradient * (*tNet).hidden_values[j];
        }
    }

    // adjust hiddens
    for(int j = 0; j < (*tNet).hidden_nodes; j++)
    {
        // adjust hidden bias
        double gradient = learning_rate * (*tNet).hidden_error[j] * dsigmoid((*tNet).hidden_values[j]);
        (*tNet).bias_h[j] += gradient;

        for(int k = 0; k < (*tNet).input_nodes; k++)
        {
            // adjust input->hidden weights
            (*tNet).weights_ih[k][j] += gradient * (*tNet).input_values[k];
        }
    }

    return tNet;
}

Finally, in my main function I did this:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

int main(void)
{
    // initialize
    srand(time(NULL));

    // create neural network
    tNetwork* network = (tNetwork*)malloc(sizeof(tNetwork));

    // set up the properties of the network and initialize it
    network->input_nodes = 2;
    network->hidden_nodes = 2;
    network->output_nodes  = 1;
    network = setup_network(network);

    // train
    for(int i = 0; i < 50000; i++)
    {
        switch(rand() % 4)
        {
            case 0:
                // train #1
                network->input_values[0] = 0;
                network->input_values[1] = 0;
                network->expected_values[0] = 0;
                network = train(network, 0.1);
                break;
            case 1:
                // train #2
                network->input_values[0] = 1;
                network->input_values[1] = 0;
                network->expected_values[0] = 1;
                network = train(network, 0.1);
                break;
            case 2:
                // train #3
                network->input_values[0] = 0;
                network->input_values[1] = 1;
                network->expected_values[0] = 1;
                network = train(network, 0.1);
                break;
            case 3:
                // train #4
                network->input_values[0] = 1;
                network->input_values[1] = 1;
                network->expected_values[0] = 0;
                network = train(network, 0.1);
                break;
            default:
                break;
        }
    }

    // check the functionality

    network->input_values[0] = 0;
    network->input_values[1] = 0;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    network->input_values[0] = 0;
    network->input_values[1] = 1;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    network->input_values[0] = 1;
    network->input_values[1] = 0;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    network->input_values[0] = 1;
    network->input_values[1] = 1;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    return 0;
}

If anyone actually read this far, I'm impressed and if there is any error spotted and explained, I'm very grateful, thanks in advance!!

Hello, I took a look at your main method and the trained model output ... it probably is good. A neural network would never give exact answers in most cases, i.e, it will give out probabilities. So why don't you use softmax activation at the last (output) layer? Then the node with the highest activation is your answer — mettleap, Jun 22 '20 at 03:01
In your current case, when the output neuron is less than 0.5, it means a 0 and it means a 1 otherwise — mettleap, Jun 22 '20 at 03:05
@mettleap Hi, thanks for responding. Sure, I want to try that. But first of all, I have to find out what a softmax even is.. :D (as said, this is the very first nn-algo I wrote, but I really want to get deeper into that topic) — rphii, Jun 22 '20 at 03:06
Could you give the outputs after say 100th training session? In a single training session, the network must see all possible inputs and their outputs and then learn from them. In the current training session that you have coded, the network only sees a single input-output pair. Maybe running the for loop for some more while may help. So 5 loops may be inadequate, because the NN never might see some cases — mettleap, Jun 22 '20 at 03:15
I came back, slept in, to check if there is a new answer. Just to clear up one possible misunderstanding: I defined one training session as (in my code) 10000 training runs. And I got the different outputs when running the program multiple times (each time training 10000 times). Maybe you interpreted one training session as training the network once? Or do you mean I should train my network 100 x 10000 times ? — rphii, Jun 22 '20 at 11:12

score 2 · Accepted Answer · answered Jun 22 '20 at 16:42

I think your code is pretty fine (I am no good at programming in C ... I come from a Java background) and that the output isn't coming because some manual tuning is needed. For example, I think that the training loop should run longer and instead of choosing randomly between 4 cases, all the cases should be chosen and trained on (this is because if we choose training samples randomly, some cases might get picked up a lot more than others causing the network to learn incorrectly). I tweaked your code to fix these (along with increasing the learning rate to 0.2) and I am getting good classifications almost always. Please try out the following code,

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define GETRANDOM   ( (double)rand() / RAND_MAX * 2.0 - 1.0 )   // random number between -1 and 1

// network structure
struct sNetwork {
    // node count
    int input_nodes;
    int hidden_nodes;
    int output_nodes;
    // values
    double* input_values;
    double* hidden_values;
    double* output_values;
    double* expected_values;
    // error
    double* hidden_error;
    double* output_error;
    // bias
    double* bias_h;
    double* bias_o;
    // weights
    double** weights_ih;
    double** weights_ho;
};
typedef struct sNetwork tNetwork;

tNetwork* setup_network(tNetwork* tNet)
{
    // general error check
    if(tNet == NULL)
    {
        return NULL;
    }
    if((*tNet).input_nodes == 0 || (*tNet).hidden_nodes == 0 || (*tNet).output_nodes == 0)
    {
        return NULL;
    }

    // based on the defined size, set up the weights

    // set up the input to hidden weights
    (*tNet).weights_ih = (double**)malloc((*tNet).input_nodes * sizeof(double*));
    for(int i = 0; i < (*tNet).input_nodes; i++)
    {
        (*tNet).weights_ih[i] = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
        for(int j = 0; j < (*tNet).hidden_nodes; j++)
        {
            (*tNet).weights_ih[i][j] = GETRANDOM;
        }
    }

    // set up the hidden to output weights
    (*tNet).weights_ho = (double**)malloc((*tNet).hidden_nodes * sizeof(double*));
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).weights_ho[i] = (double*)malloc((*tNet).output_nodes * sizeof(double));
        for(int j = 0; j < (*tNet).output_nodes; j++)
        {
            (*tNet).weights_ho[i][j] = GETRANDOM;
        }
    }

    // set up the bias

    // set up hidden bias and value
    (*tNet).bias_h = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).bias_h[i] = GETRANDOM;
    }

    // set up the output bias and value

    (*tNet).bias_o = (double*)malloc((*tNet).output_nodes * sizeof(double));
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        (*tNet).bias_o[i] = GETRANDOM;
    }

    // set up the values
    (*tNet).hidden_values = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
    (*tNet).output_values = (double*)malloc((*tNet).output_nodes * sizeof(double));
    (*tNet).input_values = (double*)malloc((*tNet).input_nodes * sizeof(double));
    (*tNet).expected_values = (double*)malloc((*tNet).output_nodes * sizeof(double));

    // set up the error stuff
    (*tNet).hidden_error = (double*)malloc((*tNet).hidden_nodes * sizeof(double));
    (*tNet).output_error = (double*)malloc((*tNet).output_nodes * sizeof(double));

    return tNet;
}

double sigmoid(double x)
{
    return 1 / (1 + exp(-x));
}

double dsigmoid(double x)
{
    return x * (1 - (x));
}

tNetwork* feed_forward(tNetwork* tNet)
{
    // calculate the hidden outputs
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).hidden_values[i] = (*tNet).bias_h[i];  // add bias to weighted sum

        for(int j = 0; j < (*tNet).input_nodes; j++)
        {
            (*tNet).hidden_values[i] += ( (*tNet).input_values[j] * (*tNet).weights_ih[j][i] ); // build the weighted sum
        }

        (*tNet).hidden_values[i] = sigmoid((*tNet).hidden_values[i]);
    }

    // calculate the output
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        (*tNet).output_values[i] = (*tNet).bias_o[i];  // add bias to weighted sum

        for(int j = 0; j < (*tNet).hidden_nodes; j++)
        {
            (*tNet).output_values[i] += ( (*tNet).hidden_values[j] * (*tNet).weights_ho[j][i] ); // build the weighted sum
        }
        (*tNet).output_values[i] = sigmoid((*tNet).output_values[i]);
    }

    return tNet;
}

tNetwork* train(tNetwork* tNet, double learning_rate)
{
    // first of all feed the network
    tNet = feed_forward(tNet);

    // init the hidden errors
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        (*tNet).hidden_error[i] = 0;
    }

    // calculate the output error
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        (*tNet).output_error[i] = ((*tNet).expected_values[i] - (*tNet).output_values[i]);
    }

    // calculate the hidden error
    for(int i = 0; i < (*tNet).hidden_nodes; i++)
    {
        for(int j = 0; j < (*tNet).output_nodes; j++)
        {
            (*tNet).hidden_error[i] += ( (*tNet).weights_ho[i][j] * (*tNet).output_error[j] );
        }
    }

    // adjust outputs
    for(int i = 0; i < (*tNet).output_nodes; i++)
    {
        // adjust output bias
        double gradient = learning_rate * (*tNet).output_error[i] * dsigmoid((*tNet).output_values[i]);
        (*tNet).bias_o[i] += gradient;

        for(int j = 0; j < (*tNet).hidden_nodes; j++)
        {
            // adjust hidden->output weights
            (*tNet).weights_ho[j][i] += gradient * (*tNet).hidden_values[j];
        }
    }

    // adjust hiddens
    for(int j = 0; j < (*tNet).hidden_nodes; j++)
    {
        // adjust hidden bias
        double gradient = learning_rate * (*tNet).hidden_error[j] * dsigmoid((*tNet).hidden_values[j]);
        (*tNet).bias_h[j] += gradient;

        for(int k = 0; k < (*tNet).input_nodes; k++)
        {
            // adjust input->hidden weights
            (*tNet).weights_ih[k][j] += gradient * (*tNet).input_values[k];
        }
    }

    return tNet;
}

int main(void)
{
    // initialize
    srand(time(NULL));

    // create neural network
    tNetwork* network = (tNetwork*)malloc(sizeof(tNetwork));

    // set up the properties of the network and initialize it
    network->input_nodes = 2;
    network->hidden_nodes = 2;
    network->output_nodes  = 1;
    network = setup_network(network);

    // train
    for(int i = 0; i < 10000; i++)
    {
        double learnRate = 0.2;
        network->input_values[0] = 0;
        network->input_values[1] = 0;
        network->expected_values[0] = 0;
        network = train(network, learnRate);
        
        network->input_values[0] = 1;
        network->input_values[1] = 0;
        network->expected_values[0] = 1;
        network = train(network, learnRate);
        
        network->input_values[0] = 0;
        network->input_values[1] = 1;
        network->expected_values[0] = 1;
        network = train(network, learnRate);
        
        network->input_values[0] = 1;
        network->input_values[1] = 1;
        network->expected_values[0] = 0;
        network = train(network, learnRate);
    }

    // check the functionality

    network->input_values[0] = 0;
    network->input_values[1] = 0;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    network->input_values[0] = 0;
    network->input_values[1] = 1;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    network->input_values[0] = 1;
    network->input_values[1] = 0;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    network->input_values[0] = 1;
    network->input_values[1] = 1;
    network = feed_forward(network);
    printf("%f\n", network->output_values[0]);

    return 0;
}

First run:

Second run:

Third run: (Sometimes the network doesn't learn, but this is ok I guess since there is no guarantee that an NN will always learn everything)

In order to increase the chances that our network will learn better, we can increase the hidden layer size (eg, from 2 to 10) and run the training for loop for 100000 number of times. In that case, the classification results are better,

First run:

Second run:

Third run:

Neural Network only works sometimes

1 Answers1