So, I started CUDA programming recently.
and I tried to make a program that start more than one thread, go into an array of global memory and the order in which it started.
However, part of the exclusive control does not seem to be working well.
I want to prevent multiple threads to simultaneously access the array Log.
now, the array Log is like this.
Log[0]=160
Log[1]=128
Log[2]=256
Log[3]=96
Log[4]=0
Log[5]=0
Log[6]=0
...etc
I want to prevent multiple threads simultaneously access to exclusive control of the memory array Log.
Is it doing wrong how to use "__threadfence ()" of?
I use CUDA5.5 and compute capability is 2.1.
Please advice someone.
Following is the source code.
#include <cuda_runtime.h>
#include <stdio.h>
#include <cuda.h>
#include <cstdio>
#include <thrust/device_ptr.h>
#define N 256
//Prototype declaration
__global__ void CudaThreadfenceTest(int *Log_d);
int main(){
int i,j;
int Log[N];
int *Log_d;
//
for(j=0;j<N;j++){
Log[j]=0;
}
// GPU memory hold
cudaMalloc((void**)&Log_d, N*sizeof(int));
// host→device
cudaMemcpy(Log_d,Log,N*sizeof(int),cudaMemcpyHostToDevice);
/*****************
*block & thread
******************/
dim3 blocks(1,1,1);
dim3 threads(256,1,1);
//run kernel
CudaThreadfenceTest<<<blocks,threads>>>(Log_d);
cudaDeviceSynchronize();
cudaMemcpy(Log,Log_d,N*sizeof(int),cudaMemcpyDeviceToHost);
for(j=0;j<N;j++){
printf("Log[ %d ]=%d \n",j,Log[j]);
}
getchar();
cudaFree(Log_d);
return 0;
}
/*************************
/* kernel
/*************************/
__global__ void CudaThreadfenceTest(int *Log_d){
printf("threadIdx.x = %d , \n",threadIdx.x);
__threadfence();
//for Log
for(int j=0;j<N;j++){
if(Log_d[j]==0){
Log_d[j]=threadIdx.x + 1;
break;
}
}
}