I'll try to address some of your problems in a simple solution:
The first problem you have is space. Since you only need numbers from 1 to 10, an int8_t would serve you much better.
The second is speed. std::vector can do a lot of allocations and reallocations under the hood as it grows. Since you have a fixed size, in my opinion there's no need to use it. Knowing this, we'll use a simple array and threads to improve performance.
Here's the code:
#include <array>
#include <random>
#include <thread>
#include <cstdint>
#include <memory>
#include <chrono>
// Number of elements to generate: exactly 2^35.
constexpr uint64_t size = UINT64_C(1) << 35;
// The values only range over 1..10, so a single signed byte per element is enough.
using vec_t = std::array<int8_t, size>;
// Fills the half-open range [start, end) with pseudo-random values in 1..10.
//
// start is the first element to write, end is one past the last.
//
// `s` is a compile-time stream tag: each distinct value instantiates a separate
// function with its own function-local static engine and distribution, so
// fill<0>, fill<1>, ... never share generator state. Concurrent calls to the
// SAME instantiation would share that state without synchronization, so every
// thread must be given its own `s`.
template<unsigned s>
void fill(vec_t::iterator start, vec_t::iterator end) {
    // Seed from the wall clock and the stream tag. Feeding both halves of the
    // 64-bit tick count plus `s` through a seed_seq uses the whole clock value
    // (no implicit truncation to int) and gives every stream its own seed.
    static const uint64_t ticks = static_cast<uint64_t>(
        std::chrono::system_clock::now().time_since_epoch().count());
    static std::seed_seq seed_words{
        static_cast<uint32_t>(ticks),
        static_cast<uint32_t>(ticks >> 32),
        static_cast<uint32_t>(s)};
    static std::default_random_engine generator(seed_words);
    // uniform_int_distribution is only specified for short/int/long/long long
    // and their unsigned counterparts; character-sized types such as int8_t
    // are not permitted. Draw an int and narrow it when storing instead.
    static std::uniform_int_distribution<int> distribution(1, 10);
    for (auto it = start; it != end; ++it) {
        // Always in 1..10, so the narrowing cast cannot lose information.
        *it = static_cast<int8_t>(distribution(generator));
    }
}
int main() {
auto vec = std::unique_ptr<vec_t>(new vec_t());
// Each will have its own generator and distribution.
std::thread a(fill<0>, vec->begin(), vec->begin() + size/4);
std::thread b(fill<1>, vec->begin() + size/4, vec->begin() + size/2);
std::thread c(fill<2>, vec->begin() + size/2, vec->begin() + (size/4)*3);
std::thread d(fill<3>, vec->begin() + (size/4)*3, vec->end());
a.join();
b.join();
c.join();
d.join();
return 0;
}