I need to fill a huge (7734500 elements) std::vector<unsigned int> with random values, and I am trying to do it in parallel with multiple threads to make it faster. Here is the code that I have so far:
// Non-deterministic source used to seed the engines in this file.
std::random_device rd;
// 64-bit Mersenne Twister at file scope, seeded from a single rd() draw.
std::mt19937_64 generator(rd());
// Number of worker threads launched to fill the vector.
static constexpr unsigned int NUM_THREADS = 4;
// Builds a uniform integer distribution over the closed range [0, modulus - 1].
//
// The upper bound is computed as unsigned long long and then narrowed to int,
// because std::uniform_int_distribution<> defaults to int.
// NOTE(review): a modulus of 0, or one larger than INT_MAX + 1, does not
// survive that narrowing as a valid upper bound -- confirm callers stay in range.
std::uniform_int_distribution<> initialize(unsigned long long int modulus)
{
    const int upperBound = static_cast<int>(modulus - 1);
    return std::uniform_int_distribution<>(0, upperBound);
}
// Thread body: stores one draw from `dist` into every element of `vector`
// whose index lies in the half-open range [start, end).
//
// Draws are taken from the file-scope `generator`.
// NOTE(review): no locking is visible in this routine, so several threads
// running it at once all advance the same engine concurrently -- confirm
// whether that is intended.
void unifRandVectorThreadRoutine(std::vector<unsigned int>& vector, unsigned int start,
                                 unsigned int end, std::uniform_int_distribution<>& dist)
{
    unsigned int index = start;
    while (index < end)
    {
        vector[index] = dist(generator);
        ++index;
    }
}
std::vector<unsigned int> uniformRandomVector
(unsigned int rows, unsigned int columns, unsigned long long int modulus)
{
std::uniform_int_distribution<> dist = initialize(modulus);
std::thread threads[NUM_THREADS];
std::vector<unsigned int> v;
v.resize(rows*columns);
// number of entries each thread will take care of
unsigned int positionsEachThread = rows*columns/NUM_THREADS;
// all but the last thread
for(unsigned int i = 0 ; i < NUM_THREADS - 1 ; ++i)
{
threads[i] = std::thread(unifRandVectorThreadRoutine, v, i*positionsEachThread,
(i+1)*positionsEachThread, dist);
// threads[i].join();
}
// last thread
threads[NUM_THREADS - 1] = std::thread(unifRandVectorThreadRoutine, v,
(NUM_THREADS-1)*positionsEachThread, rows*columns, dist);
// threads[NUM_THREADS - 1].join();
for(unsigned int i = 0 ; i < NUM_THREADS ; ++i)
{
threads[i].join();
}
return v;
}
At the moment, it takes approximately 0.3 seconds: do you think there is a way to make it more efficient?
Edit: Giving each thread its own generator
I have modified the routine as follows
// Thread body: stores one draw from `dist` into every element of `vector`
// whose index lies in the half-open range [start, end).
//
// Each call builds its own std::mt19937_64, seeded from one rd() draw, so
// the engine itself is not shared between calls.
// NOTE(review): rd is not declared in this block -- presumably the file-scope
// std::random_device, which is then still shared by all threads.
void unifRandVectorThreadRoutine(std::vector<unsigned int>& vector, unsigned int start,
                                 unsigned int end, std::uniform_int_distribution<>& dist)
{
    std::mt19937_64 engine(rd());
    unsigned int index = start;
    while (index < end)
    {
        vector[index] = dist(engine);
        ++index;
    }
}
and the running time dropped by half. I am still sharing the std::random_device, but each thread now has its own std::mt19937_64.
Edit: Giving each thread its own vector and then concatenating
I changed the code as follows:
// Thread body: appends `length` draws from `dist` to `vector`.
//
// Capacity for `length` elements is reserved up front, then a std::mt19937_64
// local to this call (seeded from one rd() draw) produces the values.
// NOTE(review): rd is not declared in this block -- presumably the file-scope
// std::random_device.
void unifRandVectorThreadRoutine(std::vector<unsigned int>& vector, unsigned int length,
                                 std::uniform_int_distribution<>& dist)
{
    vector.reserve(length);
    std::mt19937_64 engine(rd());
    for (unsigned int remaining = length; remaining != 0; --remaining)
    {
        vector.push_back(dist(engine));
    }
}
and
std::vector<unsigned int> uniformRandomVector
(unsigned int rows, unsigned int columns, unsigned long long int modulus)
{
std::uniform_int_distribution<> dist = initialize(modulus);
std::thread threads[NUM_THREADS];
std::vector<unsigned int> v[NUM_THREADS];
unsigned int positionsEachThread = rows*columns/NUM_THREADS;
// all but the last thread
for(unsigned int i = 0 ; i < NUM_THREADS - 1 ; ++i)
{
threads[i] = std::thread(unifRandVectorThreadRoutine, std::ref(v[i]), positionsEachThread, dist);
}
// last thread
threads[NUM_THREADS - 1] = std::thread(unifRandVectorThreadRoutine, std::ref(v[NUM_THREADS-1]),
rows*columns - (NUM_THREADS-1)*positionsEachThread, dist);
for(unsigned int i = 0 ; i < NUM_THREADS ; ++i)
{
threads[i].join();
}
std::vector<unsigned int> finalVector;
finalVector.reserve(rows*columns);
for(unsigned int i = 0 ; i < NUM_THREADS ; ++i)
{
finalVector.insert(finalVector.end(), v[i].begin(), v[i].end());
}
return finalVector;
}
The execution time is slightly worse than before, when I was using just one vector shared between all the threads. Am I missing something, or is this to be expected?
Edit: using a different PRNG + benchmarks
Using a different PRNG (as suggested in some comments/answers) helps a lot. I tried xorshift128+, and here is the implementation I am using:
// xorshift128+ pseudo-random number generator (shift constants 23, 17, 26).
//
// The two state words are stored as unsigned long long, which is guaranteed
// to be at least 64 bits wide; unsigned long is only 32 bits on some
// platforms (e.g. 64-bit Windows), which would silently change the sequence.
class xorShift128PlusGenerator
{
public:
    // Seeds both state words from std::random_device.
    xorShift128PlusGenerator()
    {
        // Only needed while seeding. std::random_device is not copyable, so
        // keeping it as a member would also make the generator non-copyable.
        std::random_device rd;
        const unsigned long long int s0 = draw64(rd);
        const unsigned long long int s1 = draw64(rd);
        seed(s0, s1);
    }

    // Seeds the generator with an explicit state, for reproducible sequences.
    xorShift128PlusGenerator(unsigned long long int s0, unsigned long long int s1)
    {
        seed(s0, s1);
    }

    // Advances the state and returns the next value. The return type is kept
    // as unsigned long int; where that type is narrower than 64 bits the
    // result is reduced modulo its range by the conversion.
    unsigned long int next()
    {
        unsigned long long int x = state[0];
        unsigned long long int const y = state[1];
        state[0] = y;
        x ^= x << 23; // a
        state[1] = x ^ y ^ (x >> 17) ^ (y >> 26); // b, c
        return static_cast<unsigned long int>(state[1] + y);
    }

private:
    // Combines two random_device draws so that a full 64-bit word is seeded
    // (a single draw is an unsigned int).
    static unsigned long long int draw64(std::random_device& rd)
    {
        const unsigned long long int high = rd();
        const unsigned long long int low = rd();
        return (high << 32) | low;
    }

    void seed(unsigned long long int s0, unsigned long long int s1)
    {
        // An all-zero state is a fixed point of the update above: next() would
        // return 0 forever. Replace it with an arbitrary non-zero constant.
        if (s0 == 0 && s1 == 0)
        {
            s0 = 0x9E3779B97F4A7C15ULL;
        }
        state[0] = s0;
        state[1] = s1;
    }

    unsigned long long int state[2];
};
Then the routine is as follows
void unifRandVectorThreadRoutine
(std::vector<unsigned int>& vector, unsigned int start,
unsigned int end)
{
xorShift128PlusGenerator prng;
for(unsigned int i = start ; i < end ; ++i)
{
vector[i] = prng.next();
}
}
Since I am now home and I am using a different (and more powerful) machine, I redid the tests to compare the results. Here is what I obtain:
- Mersenne Twister with one generator per thread: 0.075 seconds
- xorshift128+ shared between all threads: 0.023 seconds
- xorshift128+ with one generator per thread: 0.023 seconds
Note: the execution time varies at each repetition. These are just typical values.
So there seems to be no difference whether the xorshift generator is shared or not, but with all these improvements the execution time has dropped significantly.