I am trying to implement multithreading to a CPU intensive simulation that normally takes around 10hrs to run. I wrote a dummy code which works fine for multithreading and uses the specified number of CPU threads. But when I implement the same thing in my main program, it uses only one thread. What could be the reason for this?
My code: photongen()
void photongen() {
// Iterating over Muon list
for (auto m = muonList.begin(); m != muonList.end(); ++m) {
nP = 0; // Initialise the number of photons to 0
vector<Photon> tempP; // Temporary vector to store generated photons
// Muon propagation
bool stopLoop = false;
while (!stopLoop) {
// Photon generation
if (nPhotons()) {
for (int i = 0; i < nPhotons(); i++){
// Create photon instance
Photon *P = new Photon(m->x, m->y, m->z, m->run, m->event, i);
tempP.push_back(*P);
delete P; // Free memory by photon object
}
}
// Check if muon has left the detector
if (m->x > S_LENGTH/2. - PATH_STEP*sin(m->theta)*cos(m->phi) || m->x < -S_LENGTH/2. - PATH_STEP*sin(m->theta)*cos(m->phi)) {
stopLoop = true;
} else if (m->y > S_WIDTH/2. - PATH_STEP*sin(m->theta)*sin(m->phi) || m->y < -S_WIDTH/2. - PATH_STEP*sin(m->theta)*sin(m->phi)) {
stopLoop = true;
} else if (m->z > S_HEIGHT - PATH_STEP*cos(m->theta)) {
stopLoop = true;
} else {
// Update Muon path
m->x += PATH_STEP*sin(m->theta)*cos(m->phi);
m->y += PATH_STEP*sin(m->theta)*sin(m->phi);
m->z += PATH_STEP*cos(m->theta);
}
}
multiprocess(8, tempP, &propagation);
tempP.clear();
}
}
propagation()
void propagation(Photon *P) {
if (photonProp(P)) {
pLock.lock();
photonList.push_back(*P);
nP++;
pLock.unlock();
}
}
photonProp()
bool photonProp(Photon*& P) {
float dist[3];
for (int i = 0; i < MAX_REFL; i++) {
// Precalculation for optimisation
float theta = P->theta;
float phi = P->phi;
float sintheta = sin(theta);
float costheta = cos(theta);
float sinphi = sin(phi);
float cosphi = cos(phi);
float P_x = P->x;
float P_y = P->y;
float P_z = P->z;
// Find plane of intersection of particle
dist[0] = (sintheta*cosphi > 0) ? (S_LENGTH/2. - P_x)/(sintheta*cosphi) : (-S_LENGTH/2. - P_x)/(sintheta*cosphi);
dist[1] = (sintheta*sinphi > 0) ? (S_WIDTH/2. - P_y)/(sintheta*sinphi) : (-S_WIDTH/2. - P_y)/(sintheta*sinphi);
dist[2] = (costheta > 0) ? (S_HEIGHT - P_z)/costheta : -P_z/costheta;
// Calculate the plane of intersection
int min_index = distance(dist,min_element(dist, dist + 3));
float d = dist[min_index]; // distance from original point to plane
// Check for attenuation
float lambda_sc = 100.;
float x_surv = -lambda_sc*log(r->Uniform());
if (d > x_surv) return false;
// Update particle coords
P->x = P_x + d*sintheta*cosphi;
P->y = P_y + d*sintheta*sinphi;
P->z = P_z + d*costheta;
// check if particle hit photodetector
if (min_index == 0 && sintheta*cosphi > 0 && P->y <= -S_WIDTH/2. + 1) {
return true;
}
// Check for reflective loss
if (r->Uniform() < 0.05) return false;
// reflect particle
if (min_index == 0) {
P->phi = M_PI - phi;
} else if (min_index == 1) {
P->phi = -phi;
} else if (min_index == 2) {
P->theta = -theta;
}
}
return false;
}
multiprocess()
template <typename T>
void multiprocess(int nThreads, vector<T> arg, void (*func)(T*)) {
size_t i;
for (i=0; i < arg.size() - nThreads; i += nThreads) {
vector<thread> t(nThreads);
for (int j = 0; j < nThreads; j++){
t[j] = thread(func, &arg[i+j]);
}
for (int j = 0; j < nThreads; j++){
t[j].join();
}
}
vector<thread> t(arg.size()%nThreads);
for (std::size_t j = 0; j < arg.size()%nThreads; j++){
t[j] = thread(func, &arg[i+j]);
}
for (std::size_t j = 0; j < arg.size()%nThreads; j++){
t[j].join();
}
}