Ok, I'm pretty new into CUDA, and I'm kind of lost, really lost.
I'm trying to calculate pi using the Monte Carlo Method, and at the end I just get one add instead of 50.
I don't want to "do while" for calling the kernel, since it's too slow. My issue is, that my code don't loop, it executes only once in the kernel.
And also, I'd like that all the threads access the same niter and pi, so when some thread hit the counters all the others would stop.
#define SEED 35791246
__shared__ int niter;
__shared__ double pi;
__global__ void calcularPi(){
double x;
double y;
int count;
double z;
count = 0;
niter = 0;
//keep looping
do{
niter = niter + 1;
//Generate random number
curandState state;
curand_init(SEED,(int)niter, 0, &state);
x = curand(&state);
y = curand(&state);
z = x*x+y*y;
if (z<=1) count++;
pi =(double)count/niter*4;
}while(niter < 50);
}
int main(void){
float tempoTotal;
//Start timer
clock_t t;
t = clock();
//call kernel
calcularPi<<<1,32>>>();
//wait while kernel finish
cudaDeviceSynchronize();
typeof(pi) piFinal;
cudaMemcpyFromSymbol(&piFinal, "pi", sizeof(piFinal),0, cudaMemcpyDeviceToHost);
typeof(niter) niterFinal;
cudaMemcpyFromSymbol(&niterFinal, "niter", sizeof(niterFinal),0, cudaMemcpyDeviceToHost);
//Ends timer
t = clock() - t;
tempoTotal = ((double)t)/CLOCKS_PER_SEC;
printf("Pi: %g \n", piFinal);
printf("Adds: %d \n", niterFinal);
printf("Total time: %f \n", tempoTotal);
}