I'm a newbie to CPU multithreading and I'm trying to implement a parallel feed-forward algorithm for a standard three-layer neural network. The problem is that the parallel version is about 10 times slower than the serial one; I think the reason is the overhead of launching too many threads. I'm using an Intel i7 920 with 4 cores, hyperthreading disabled, OS Fedora 20, compiler GCC 4.8.2.
Any ideas how to improve performance ?
// Runs one feed-forward pass, parallelising each layer across worker threads.
// Every layer is fully computed (all its workers joined) before the next layer
// starts, because layer k+1 reads the activations written by layer k.
//
// NOTE(review): spawning fresh threads on every call is costly; for small
// layers the thread create/join overhead can dwarf the arithmetic itself.
// A persistent thread pool would avoid this — confirm with profiling.
template<class T, class TM>
void NeuralNetwork<T, TM>::feedForwardParallel()
{
    // Layer 1: input -> hidden1, split into 4 contiguous chunks of block_size0.
    thread layer0[4];
    for (int k = 0; k < 4; k++)
        layer0[k] = thread(&NeuralNetwork<T, TM>::parallel_sum0, this, block_size0, k*block_size0);
    for (int k = 0; k < 4; k++)
        layer0[k].join();

    // Layer 2: hidden1 -> hidden2, split into 4 contiguous chunks of block_size1.
    thread layer1[4];
    for (int k = 0; k < 4; k++)
        layer1[k] = thread(&NeuralNetwork<T, TM>::parallel_sum1, this, block_size1, k*block_size1);
    for (int k = 0; k < 4; k++)
        layer1[k].join();

    // Output layer: one neuron per thread.
    thread layer2[2];
    for (int k = 0; k < 2; k++)
        layer2[k] = thread(&NeuralNetwork<T, TM>::parallel_sum2, this, 1, k);
    for (int k = 0; k < 2; k++)
        layer2[k].join();
}
// Computes hidden-layer-1 activations for the neuron range [start, start+size).
// For each neuron i: weighted sum over all inputs, plus the bias term stored
// in the extra row weightsIH[INPUT_NEURONS][i], squashed through sigmoid().
// Safe to run concurrently with other instances as long as the [start, size)
// ranges do not overlap: each call writes only hidden1N[start..start+size).
template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum0(int size, int start)
{
    for (int i = start; i < start+size; i++)
    {
        // BUG FIX: sum must be reset for every neuron. It was previously
        // declared once outside this loop, so each neuron's activation
        // accumulated the sums of all neurons processed before it.
        T sum = 0;
        for (int j = 0; j < INPUT_NEURONS; j++)
            sum += inputN[j] * weightsIH[j][i];
        sum += weightsIH[INPUT_NEURONS][i]; // bias weight
        hidden1N[i] = sigmoid(sum);
    }
}
// Computes hidden-layer-2 activations for the neuron range [start, start+size).
// For each neuron i: weighted sum over hidden-layer-1 outputs, plus the bias
// term stored in the extra row weightsHH[HIDDEN_NEURONS1][i], through sigmoid().
// Concurrent calls must use disjoint [start, size) ranges — each call writes
// only hidden2N[start..start+size).
template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum1(int size, int start)
{
    for (int i = start; i < start+size; i++)
    {
        // BUG FIX: sum must be reset for every neuron; it previously lived
        // outside this loop and carried over between neurons.
        T sum = 0.0;
        for (int j = 0; j < HIDDEN_NEURONS1; j++)
            sum += hidden1N[j] * weightsHH[j][i];
        sum += weightsHH[HIDDEN_NEURONS1][i]; // bias weight
        hidden2N[i] = sigmoid(sum);
    }
}
// Computes output-layer activations for the neuron range [start, start+size).
// For each output neuron i: weighted sum over hidden-layer-2 outputs, plus the
// bias term stored in the extra row weightsHO[HIDDEN_NEURONS2][i], through
// sigmoid(). Concurrent calls must use disjoint [start, size) ranges.
template <class T, class TM>
void NeuralNetwork<T, TM>::parallel_sum2(int size, int start)
{
    for (int i = start; i < start+size; i++)
    {
        // BUG FIX: reset sum per neuron. With the current callers (size == 1)
        // the old code happened to work, but any size > 1 would have leaked
        // one neuron's sum into the next.
        T sum = 0.0;
        for (int j = 0; j < HIDDEN_NEURONS2; j++)
            sum += hidden2N[j] * weightsHO[j][i];
        sum += weightsHO[HIDDEN_NEURONS2][i]; // bias weight
        outputN[i] = sigmoid(sum);
    }
}
// Activation function applied to every neuron's weighted sum.
// Implemented as the hyperbolic tangent, mapping any real input into (-1, 1).
template<class T, class TM>
T NeuralNetwork<T, TM>::sigmoid(T val)
{
    const T activated = tanh(val);
    return activated;
}