I am new with cuda and I have a problem. I want to put a synchronization to my threads so I tried to use syncthreads. The problem is that Visual Studio 2010 says: idetifier __syncthreads() is undefined... I am using cuda 4.2 by the way. So I decided to use cudaDeviceSynchronize() instead and call it from host. My code is something like the above (i send to you only the important parts):
__global__ void sum( float avg[]){
avg[0]+=1;
avg[1]+=2;
}
int main(){
float avg[2];
float *devAvg;
cudaError_t cudaStatus;
size_t size=sizeof(unsigned char)*2;
cudaStatus = cudaMalloc((void**)&devAvg, size2);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc 2 failed!");
return -1;
}
avg[0]=0;
avg[1]=0;
cudaStatus = cudaMemcpy(devAvg,avg, size, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
return -1;
}
dim3 nblocks(40,40);
dim3 nthreads(20,20);
sum<<<nblocks,nthreads,msBytes>>>(devAvg);
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
}
cudaStatus = cudaMemcpy(avg,devAvg,size,cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy Device to Host failed!");
return -1;}
cout<<"avg[0]="avg[0]<<" avg[1]="<<avg[1]<<endl;
cudaFree devAvg;
return 0;
}
I thought that the results should be avg[0]=640.000 avg[1]=1.280.000
but not only my results are different(this could be an overflow problem) but they does not be stable. For example for three different executions the results are:
avg[0]=3041 avg[1]=6604
avg[0]=3015 avg[1]=6578
avg[0]=3047 avg[1]=6600
So what I am doing wrong here?Is it a synchronization problem?And why I cannot use __syncthreads() Or is it the problem of race conditions?
Additionally for the __syncthreads() problem it comes with any code that I write. Even the simplest one:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <Windows.h>
// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N) a[idx] = a[idx] * a[idx];
__syncthreads();
}
// main routine that executes on the host
int main(void)
{
float *a_h, *a_d; // Pointer to host & device arrays
const int N = 10; // Number of elements in arrays
size_t size = N * sizeof(float);
a_h = (float *)malloc(size); // Allocate array on host
cudaMalloc((void **) &a_d, size); // Allocate array on device
// Initialize host array and copy it to CUDA device
for (int i=0; i<N; i++) a_h[i] = (float)i;
cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
// Do calculation on device:
int block_size = 4;
int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
square_array <<< n_blocks, block_size >>> (a_d, N);
// Retrieve result from device and store it in host array
cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
// Print results
for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
// Cleanup
free(a_h); cudaFree(a_d);
return 0;
}
It is saying this: Error: identifier "__syncthreads()" is undefined
The funny part is that even with the sample codes that comes with the 4.2 CUDA SDK the same thing happens... Maybe is something more general wrong because there are more functions in the SDK samples that are considered undefined.