
I'm coding a neural network and I'm having trouble understanding and coding backpropagation: the network doesn't learn properly, and I don't know where the problem in my backpropagation function is.

This is my error function: L = p·log(q) + (1 − p)·log(1 − q)

And my activation function is f(x)=1/(1+e^(-x))
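
For reference, two standard identities (not stated in the original post, but they are what the code below relies on): the derivative of the logistic function is f'(x) = f(x)·(1 − f(x)), and if q = f(z) is the network output and p is the target, then dL/dz for the loss above simplifies to p(1 − q) − (1 − p)q = p − q. That is why, for this particular loss, the output-layer delta can be just (target − output) with no extra derivative factor.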

I'm testing it on the easy problem x > y, which doesn't really need a multilayer perceptron.

#include <bits/stdc++.h>

using namespace std;
const double ETA=0.001;
const double EPS=0.000001;
const bool OUTPUT=true;
typedef function<double(vector<double>, vector<double>)> func;
double scalar_product(vector<double> a, vector<double> b)
{
    assert(a.size() == b.size());
    double ret = 0.0;
    for (int i=0; i<a.size(); i++)
    {
        ret += a[i] * b[i];
    }
    return ret;
}
class Perceptron
{
public:
    int n;
    double val,der;
    double delta;
    vector<int> next;
    vector<double> w;
    function<double(vector<double>, vector<double>)> h;
    function<double(vector<double>, vector<double>)> d;

    Perceptron(int n,func h,func d):n(n),h(h),d(d)
    {
        for(int i=0; i<n+1; i++)
            w.push_back(1.0*rand()/RAND_MAX);
    }
    double calc(vector<double> x)
    {
        val=h(w,x);
        der=d(w,x);
        return val;
    }
};
class NeuralNetwork
{
public:
    int inputLayer,hiddenLayer,outputLayer;
    vector<Perceptron> inputNeurons;
    vector<Perceptron> hiddenNeurons;
    vector<Perceptron> outputNeurons;
    NeuralNetwork(int in,int hid,int out)
    {
        inputLayer=in;
        hiddenLayer=hid;
        outputLayer=out;

        auto logistic = [] (vector<double> w, vector<double> x) -> double
        {
            x.push_back(1.0);
            return 1.0 / (1.0 + exp(-scalar_product(w, x)));
        };
        auto d_logistic = [logistic] (vector<double> w, vector<double> x) -> double
        {
            double lst = logistic(w, x);
            return lst * (1.0 - lst);
        };

        auto input = [] (vector<double> w, vector<double> x) -> double
        {
            return x[0];
        };
        auto d_input = [] (vector<double> w, vector<double> x) -> double
        {
            return 1.0;
        };

        auto ident = [] (vector<double> w, vector<double> x) -> double
        {
            x.push_back(1.0);
            return scalar_product(w, x);
        };
        auto d_ident = [] (vector<double> w, vector<double> x) -> double
        {
            return 1.0;
        };

        for(int i=0; i<inputLayer; i++)
            inputNeurons.push_back(Perceptron(1,input,d_input));
        if(OUTPUT)cout<<"Created "<<inputLayer<<" input neurons."<<endl;
        for(int i=0; i<hiddenLayer; i++)
            hiddenNeurons.push_back(Perceptron(inputLayer,logistic,d_logistic));
        if(OUTPUT)cout<<"Created "<<hiddenLayer<<" hidden neurons."<<endl;
        for(int i=0; i<outputLayer; i++)
            outputNeurons.push_back(Perceptron(hiddenLayer,logistic,d_logistic));
        if(OUTPUT)cout<<"Created "<<outputLayer<<" output neurons."<<endl;

    }
    vector<double> feedForward(vector<double> x)
    {
        for(int i=0; i<inputLayer; i++)
            inputNeurons[i].calc({x[i]});
        vector<double> inputLayerOut;
        for(int i=0; i<inputLayer; i++)
            inputLayerOut.push_back(inputNeurons[i].val);

        for(int i=0; i<hiddenLayer; i++)
            hiddenNeurons[i].calc(inputLayerOut);
        vector<double> hiddenLayerOut;
        for(int i=0; i<hiddenLayer; i++)
            hiddenLayerOut.push_back(hiddenNeurons[i].val);

        for(int i=0; i<outputLayer; i++)
            outputNeurons[i].calc(hiddenLayerOut);
        vector<double> outputLayerOut;
        for(int i=0; i<outputLayer; i++)
            outputLayerOut.push_back(outputNeurons[i].val);
        return outputLayerOut;
    }
    void backPropagation(vector<vector<double> > x, vector<vector<double> > y, int max_steps)
    {
        double diff;
        do
        {
            diff=0.0;
            for(int t=0; t<x.size(); t++)
            {
                vector<double> out=feedForward(x[t]);
                for(int i=0; i<outputLayer; i++)
                    outputNeurons[i].delta=(y[t][i]-outputNeurons[i].val);
                for(int i=0; i<hiddenLayer; i++)
                {
                    double sum=0.0;
                    for(int j=0; j<outputLayer; j++)
                        sum+=outputNeurons[j].delta*hiddenNeurons[i].w[j];
                    hiddenNeurons[i].delta=hiddenNeurons[i].der*sum;
                }
            }
            for(int i=0; i<outputLayer; i++)
            {
                for (int j=0; j<outputNeurons[i].w.size(); j++)
                {
                    double z = (j < outputNeurons[i].n) ? hiddenNeurons[j].val : 1.0;
                    double curr = ETA * outputNeurons[i].delta * z;
                    outputNeurons[i].w[j] += curr;
                    diff += curr * curr;
                }
            }
            for(int i=0; i<hiddenLayer; i++)
            {
                for (int j=0; j<hiddenNeurons[i].w.size(); j++)
                {
                    double z = (j < hiddenNeurons[i].n) ? inputNeurons[j].val : 1.0;
                    double curr = ETA * hiddenNeurons[i].delta * z;
                    hiddenNeurons[i].w[j] += curr;
                    diff += curr * curr;
                }
            }
            if(OUTPUT&&max_steps%100==0)cout<<"Current diff: "<<diff<<endl;
        }
        while(diff>EPS&&max_steps--);
    }

};

int main()
{
    //srand(time(NULL));
    NeuralNetwork nn=NeuralNetwork(2,5,1);
    vector<vector<double> > trainingInput;
    vector<vector<double> > trainingOutput;
    trainingInput.resize(100);
    trainingOutput.resize(100);
    for(int i=0; i<100; i++)
    {
        int x=rand()%100;
        int y=rand()%100;
        trainingInput[i].push_back(x);
        trainingInput[i].push_back(y);
        trainingOutput[i].push_back(x>y?1:0);
    }
    nn.backPropagation(trainingInput,trainingOutput,10000);
    while(true)
    {
        double x,y; // double, not int: brace-initializing a vector<double> from ints is a narrowing conversion
        cin>>x>>y;
        cout<<nn.feedForward({x,y})[0]<<endl;
    }
    return 0;
}
  • But the problem is that I don't understand backpropagation completely. – Filip V Jun 20 '16 at 21:19
  • I'm not sure my formulas are correct. – Filip V Jun 20 '16 at 21:20
  • I have posted the whole code only for context; the problem is somewhere in the backPropagation function. – Filip V Jun 20 '16 at 21:22
  • Well, as mentioned, use the debugger, step to that point and check your current variable values. – πάντα ῥεῖ Jun 20 '16 at 21:23
  • @πάνταῥεῖ "Don't ask questions" isn't useful advice on a site devoted to asking questions. – Robert Dodier Jun 20 '16 at 21:25
  • But I don't completely understand the mathematical background, so I can't know which variable values are OK and which are not. – Filip V Jun 20 '16 at 21:25
  • Off topic: The usual commentary. http://stackoverflow.com/questions/1452721/why-is-using-namespace-std-in-c-considered-bad-practice and http://stackoverflow.com/questions/31816095/why-should-i-not-include-bits-stdc-h – user4581301 Jun 20 '16 at 21:36
  • Off topic: Well, this is a habit from competitive programming. – Filip V Jun 20 '16 at 21:48
  • Weights and the activation function's input can be negative, even when the final activation is not. So when you initialize your weights, half the possible range should be negative. Also, I don't like the big initial weights with a mean absolute value of 0.5. This just puts most initial activations on one of the extremes of the sigmoid, saturated at 0 or 1, and with a derivative somewhere close to zero -- so updates will change weights very little, or not at all. Your initial weights, with typical inputs, should produce activations between 0.1 and 0.9, so scale them down. – Christopher Oicles Jun 20 '16 at 23:03
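
A minimal sketch of the initialization the last comment suggests (the helper name initWeight and the scale constant are my own choices, not from the comment; the scale should be tuned so that pre-activations for typical inputs stay roughly in [−2, 2]):

#include <cstdlib>

// Draw one weight uniformly from a small symmetric range around zero,
// so initial activations land on the steep part of the sigmoid rather
// than in its saturated tails.
double initWeight(double scale = 0.01)
{
    double u = 1.0 * rand() / RAND_MAX;   // uniform in [0, 1]
    return scale * (2.0 * u - 1.0);       // uniform in [-scale, scale]
}

// In the Perceptron constructor, the loop would then read:
//     for (int i = 0; i < n + 1; i++)
//         w.push_back(initWeight());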

1 Answer


For backpropagation code, the key point is to calculate the gradient correctly. You can check the gradient by computing a numerical approximation and comparing the two. A numerical approximation to the gradient is just (F(x; w + dw) - F(x; w))/dw, where x is an input vector (it could be anything; it doesn't matter for this purpose), w is one of the weights or biases, and dw is a small number (e.g. 1e-4). For each weight and bias, compute that difference quotient, then compare the vector of approximations to whatever you are getting from your code. They should be approximately the same, and should get closer as you make dw smaller.
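
A sketch of that check, written against the NeuralNetwork class from the question (the helper names loss and numericalGradient are my own scaffolding, and I take F to be the negated error -L for a single training pair, so "gradient" means the direction gradient descent would move in):

#include <cmath>
#include <vector>
using namespace std;
// (uses the NeuralNetwork / Perceptron classes from the question above)

// Negated error -L = -(p*log(q) + (1-p)*log(1-q)) for one training pair.
double loss(NeuralNetwork &nn, vector<double> x, double p)
{
    double q = nn.feedForward(x)[0];
    return -(p * log(q) + (1.0 - p) * log(1.0 - q));
}

// Finite-difference approximation of d(loss)/d(weight) for one weight of
// one output neuron: perturb the weight, re-run the forward pass, and
// take the difference quotient.
double numericalGradient(NeuralNetwork &nn, vector<double> x, double p,
                         int neuron, int weightIndex, double dw = 1e-4)
{
    double before = loss(nn, x, p);
    nn.outputNeurons[neuron].w[weightIndex] += dw;
    double after = loss(nn, x, p);
    nn.outputNeurons[neuron].w[weightIndex] -= dw;   // restore the weight
    return (after - before) / dw;
}

For an output weight j under this loss, the analytic value to compare against is (q − p)·z, where q is the network output, p the target, and z the j-th hidden activation (or 1.0 for the bias weight); the update of +ETA*delta*z in the question's code is the negative of that, scaled by the learning rate. The two numbers should roughly agree and agree better as dw shrinks, and the same idea extends to the hidden-layer weights.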

  • (F(x; w + dw) - F(x; w))/dw is the definition of the derivative. I don't know where to apply this, and to which part of the code. – Filip V Jun 20 '16 at 21:33
  • Do you have some material that describes backpropagation well? – Filip V Jun 20 '16 at 21:34
  • Take a look at, say, "Pattern Recognition and Neural Networks" by Brian Ripley, and/or "Elements of Statistical Learning" by Hastie, Tibshirani, and Friedman. – Robert Dodier Jun 20 '16 at 23:32