I have some expensive computation I want to divide and distribute over a set of threads. I dumbed down my code to a minimal example where this is still happening.
In short:
I have N tasks that I want to divide among "Threads" threads.
Each task is the following simple function, which just runs a bunch of simple mathematical operations. (In practice I verify asymmetric signatures here, but I left that out for the sake of simplicity.)
// Stand-in workload for one task: 100000 outer iterations, each making 1000
// sqrt calls whose results are discarded.
while (i++ < 100000)
{
for (int y = 0; y < 1000; y++)
{
sqrt(y);
}
}
Running the above code with 1 thread results in 0.36 seconds per operation (outermost for loop), and thus in around 36 seconds overall execution time.
Thus, parallelization seemed like an obvious way to speed it up. However, with two threads the operation time rises to 0.72 seconds completely destroying any speed up.
Adding more threads usually makes the performance increasingly worse.
I have an Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz with 6 physical cores, so I'd expect a performance boost at least when going from 1 to 2 threads. But in fact each operation slows down as the number of threads increases.
Am I doing something wrong?
Full code:
using namespace std;

// Total number of tasks to execute across all worker threads.
constexpr size_t N = 100;

// Number of worker threads the tasks are split across.
constexpr size_t Threads = 1;

// Count of tasks completed so far; incremented once per finished task by the
// workers.
atomic_int counter{0};
/// Slice of the task range assigned to a single worker thread.
struct ThreadData
{
    int index; // number of the first task in the slice
    int count; // how many consecutive tasks the slice contains

    /// Builds a slice starting at task `firstTask` and spanning `taskCount` tasks.
    ThreadData(const int firstTask, const int taskCount)
        : index{firstTask}, count{taskCount}
    {
    }
};
void *executeSlave(void *threadarg)
{
struct ThreadData *my_data;
my_data = static_cast<ThreadData *>(threadarg);
for( int x = my_data->index; x < my_data->index + my_data->count; x++ )
{
cout << "Thread: " << my_data->index << ": " << x << endl;
clock_t start, end;
start = clock();
int i = 0;
while (i++ < 100000)
{
for (int y = 0; y < 1000; y++)
{
sqrt(y);
}
}
counter.fetch_add(1);
end = clock();
cout << end - start << ':' << CLOCKS_PER_SEC << ':' << (((float) end - start) / CLOCKS_PER_SEC)<< endl;
}
pthread_exit(NULL);
}
/**
 * Splits the N tasks across `Threads` worker threads, waits for all of them to
 * finish, and prints the total wall-clock time as
 * `<microseconds>:<ticks per second>:<seconds>`.
 *
 * Exits with status -1 if a thread cannot be created or joined.
 */
int main()
{
    static_assert(Threads > 0, "at least one worker thread is required");

    // Wall-clock time: clock() would add up the CPU time of every thread.
    const auto start = std::chrono::steady_clock::now();

    pthread_t threads[Threads];

    // Reserve up front so emplace_back never reallocates; the workers hold
    // pointers to these elements.
    vector<ThreadData> td;
    td.reserve(Threads);

    const int each = N / Threads;
    cout << each << endl;

    for (size_t x = 0; x < Threads; x++) {
        cout << "main() : creating thread, " << x << endl;

        // Slice boundaries computed so that a remainder of N / Threads is
        // spread over the threads instead of being dropped.
        const int first = static_cast<int>(x * N / Threads);
        const int last = static_cast<int>((x + 1) * N / Threads);
        td.emplace_back(first, last - first);

        const int rc = pthread_create(&threads[x], NULL, executeSlave, &td[x]);
        if (rc) {
            cout << "Error:unable to create thread," << rc << endl;
            exit(-1);
        }
    }

    // Join instead of polling the counter: this guarantees `td` outlives every
    // worker and that all worker output is written before the final report.
    for (size_t x = 0; x < Threads; x++) {
        const int rc = pthread_join(threads[x], NULL);
        if (rc) {
            cout << "Error:unable to join thread," << rc << endl;
            exit(-1);
        }
    }

    const auto end = std::chrono::steady_clock::now();
    const auto elapsed_us =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    const float elapsed_s = std::chrono::duration<float>(end - start).count();

    cout << "Final:" << endl;
    cout << elapsed_us << ':' << std::chrono::microseconds::period::den << ':' << elapsed_s
         << endl;
}