
I'm a newbie at CPU multithreading and I'm trying to implement a parallel feed-forward algorithm for a standard three-layer neural network. The problem is that the parallel version is about 10 times slower than the serial one... I think the reason is that I'm launching too many threads. I'm using an Intel i7 920 with 4 cores, hyperthreading disabled; OS Fedora 20, compiler GCC 4.8.2.

Any ideas on how to improve performance?

// std::thread needs <thread>; a "using std::thread;" declaration is assumed elsewhere.
template<class T, class TM>
void NeuralNetwork<T, TM>::feedForwardParallel()
{
    // Input -> hidden layer 1: split the hidden1 neurons across 4 threads
    thread t0(&NeuralNetwork<T, TM>::parallel_sum0, this,  block_size0, 0);
    thread t1(&NeuralNetwork<T, TM>::parallel_sum0, this,  block_size0, block_size0);
    thread t2(&NeuralNetwork<T, TM>::parallel_sum0, this,  block_size0, 2*block_size0);
    thread t3(&NeuralNetwork<T, TM>::parallel_sum0, this,  block_size0, 3*block_size0);

    t0.join();
    t1.join();
    t2.join();
    t3.join();

    // Hidden layer 1 -> hidden layer 2: again 4 threads
    thread t4(&NeuralNetwork<T, TM>::parallel_sum1, this,  block_size1, 0);
    thread t5(&NeuralNetwork<T, TM>::parallel_sum1, this,  block_size1, block_size1);
    thread t6(&NeuralNetwork<T, TM>::parallel_sum1, this,  block_size1, 2*block_size1);
    thread t7(&NeuralNetwork<T, TM>::parallel_sum1, this,  block_size1, 3*block_size1);

    t4.join();
    t5.join();
    t6.join();
    t7.join();

    // Hidden layer 2 -> output: one thread per output neuron
    thread t8 (&NeuralNetwork<T, TM>::parallel_sum2, this,  1, 0);
    thread t9 (&NeuralNetwork<T, TM>::parallel_sum2, this,  1, 1);

    t8.join();
    t9.join();
}


template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum0(int size, int start)
{
    for (int i = start; i < start+size; i++)
    {
        T sum = 0;                            // reset the accumulator per neuron

        for (int j = 0; j < INPUT_NEURONS; j++)
            sum += inputN[j] * weightsIH[j][i];

        sum += weightsIH[INPUT_NEURONS][i];   // bias weight
        hidden1N[i] = sigmoid(sum);
    }
}

template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum1(int size, int start)
{
    for (int i = start; i < start+size; i++)
    {
        T sum = 0.0;                          // reset the accumulator per neuron

        for (int j = 0; j < HIDDEN_NEURONS1; j++)
            sum += hidden1N[j] * weightsHH[j][i];

        sum += weightsHH[HIDDEN_NEURONS1][i]; // bias weight
        hidden2N[i] = sigmoid(sum);
    }
}

template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum2(int size, int start)
{
    for (int i = start; i < start+size; i++)
    {
        T sum = 0.0;                          // reset the accumulator per neuron

        for (int j = 0; j < HIDDEN_NEURONS2; j++)
            sum += hidden2N[j] * weightsHO[j][i];

        sum += weightsHO[HIDDEN_NEURONS2][i]; // bias weight
        outputN[i] = sigmoid(sum);
    }
}

template<class T, class TM>
T NeuralNetwork<T, TM>::sigmoid(T val) {
    return tanh(val);
}
  • Hi to the censors: he is a newbie, and his question is not so badly described ... – Jean Davy Feb 02 '14 at 13:15
  • You can get the maximum count of hardware threads with std::thread::hardware_concurrency. You have to "push" your threads into a thread pool; the 'taskqueue' lib that you can find on GitHub is an easy one to start with. It leverages boost::asio, which offers great tools to achieve your goals. – Jean Davy Feb 02 '14 at 13:26

1 Answer


Creating new threads is very expensive, and the cost of spawning a thread can easily be bigger than the cost of computing everything you need in a single thread. Instead, you should use a thread pool (for example, OpenMP does this for you automatically), or you may use std::async with the std::launch::async | std::launch::deferred flags instead.
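
As an illustration of the OpenMP suggestion, here is a rough sketch of how the first layer might look. The member names are taken from the question, the function name feedForwardLayer1OMP is made up for this sketch, and it needs to be compiled with -fopenmp:

// OpenMP keeps its worker threads alive between parallel regions, so no
// threads are created per call.
template <class T, class TM>
void NeuralNetwork<T, TM>::feedForwardLayer1OMP()
{
    #pragma omp parallel for
    for (int i = 0; i < HIDDEN_NEURONS1; i++)
    {
        T sum = 0;
        for (int j = 0; j < INPUT_NEURONS; j++)
            sum += inputN[j] * weightsIH[j][i];

        sum += weightsIH[INPUT_NEURONS][i];   // bias
        hidden1N[i] = sigmoid(sum);
    }
}

And a std::async variant of the same layer (again a sketch with a hypothetical function name; with this launch policy the implementation decides whether each task gets its own thread or is deferred and run inside get()):

#include <future>

template <class T, class TM>
void NeuralNetwork<T, TM>::feedForwardLayer1Async()
{
    const std::launch policy = std::launch::async | std::launch::deferred;
    std::future<void> f[4];

    for (int k = 0; k < 4; ++k)
        f[k] = std::async(policy, &NeuralNetwork<T, TM>::parallel_sum0, this,
                          block_size0, k * block_size0);

    for (int k = 0; k < 4; ++k)
        f[k].get();   // wait for all four chunks before starting the next layer
}

Note that with async | deferred the implementation is allowed to defer every task and run it serially inside get(); use std::launch::async alone if you want to force concurrency, at the cost of one thread per task.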

– Alex Telishev