I have a code of which a section needs to be executed critically. I am using a lock for that piece of code so that each thread of the kernel (set up with one thread per block) executes that piece of code atomically. The order of the threads is what bothers me - I need the threads to execute in chronological order according to their indices (or actually, in order of their blockIdx), from 0 to say 10 (instead of randomly e.g. 5, 8, 3, 0, ...etc). Is it possible to do that?
Here is an example code:
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<math_functions.h>
#include<time.h>
#include<cuda.h>
#include<cuda_runtime.h>
// number of blocks
#define nob 10
struct Lock{
int *mutex;
Lock(void){
int state = 0;
cudaMalloc((void**) &mutex, sizeof(int));
cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
}
~Lock(void){
cudaFree(mutex);
}
__device__ void lock(void){
while(atomicCAS(mutex, 0, 1) != 0);
}
__device__ void unlock(void){
atomicExch(mutex, 0);
}
};
__global__ void theKernel(Lock myLock){
int index = blockIdx.x; //using only one thread per block
// execute some parallel code
// critical section of code (thread with index=0 needs to start, followed by index=1, etc.)
myLock.lock();
printf("Thread with index=%i inside critical section now...\n", index);
myLock.unlock();
}
int main(void)
{
Lock myLock;
theKernel<<<nob, 1>>>(myLock);
return 0;
}
which gives the following results:
Thread with index=1 inside critical section now...
Thread with index=0 inside critical section now...
Thread with index=5 inside critical section now...
Thread with index=9 inside critical section now...
Thread with index=7 inside critical section now...
Thread with index=6 inside critical section now...
Thread with index=3 inside critical section now...
Thread with index=2 inside critical section now...
Thread with index=8 inside critical section now...
Thread with index=4 inside critical section now...
I want these indices to start from 0 and execute chronologically to 9.
One way I thought to modify the Lock to achieve this is as follows:
struct Lock{
int *indexAllow;
Lock(void){
int startVal = 0;
cudaMalloc((void**) &indexAllow, sizeof(int));
cudaMemcpy(indexAllow, &startVal, sizeof(int), cudaMemcpyHostToDevice);
}
~Lock(void){
cudaFree(indexAllow);
}
__device__ void lock(int index){
while(index!=*indexAllow);
}
__device__ void unlock(void){
atomicAdd(indexAllow,1);
}
};
and then to just initialize the lock by passing the index as an argument:
myLock.lock(index);
but this stalls my pc... I'm probably missing something obvious.
If anyone can help I'd appreciate it!
Thanks!!!