I am a beginner in CUDA and I tried this example code.
int main()
{
int i;
cudaError_t cudastatus;
float in[9]={1,2,3,4,5,6,7,8,9};
float* h_in=in;
float* d_in={0};
cudaMalloc((void**)&d_in,9*sizeof(float));
cudaCheckErrors("malloc failed");
cudastatus=cudaMemcpy(d_in,h_in,9*sizeof(float),cudaMemcpyHostToDevice);
cudaCheckErrors("memcpyh2d failed");
float* d_out={0};
cudaMalloc((void**)&d_out,9*sizeof(float));
float* out[9]={0};
kernel<<<3,3>>>(d_in,d_out);
cudastatus=cudaDeviceSynchronize();
cudaError_t cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
getchar();
}
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
getchar();
}
cudastatus=cudaMemcpy(out,d_out,9*sizeof(float),cudaMemcpyDeviceToHost);
cudaCheckErrors("memcpyd2h failed");
for(i=0;i<9;i++)
{
printf("%f\n",out[i]);
}
getchar();
return 0;
}
The kernel code is like this
__device__ void func(float temp)
{
float a=2;
temp=temp*a;
return;
}
__global__ void kernel(float* d_in, float* d_out)
{
int tid=(blockIdx.x*blockDim.x)+threadIdx.x;
float temp=d_in[tid];
func(temp);
d_out[tid]=d_out[tid]+temp;
}
But when I print the values of out array, the values are all zero. Here my question is Is the device function being called from the kernel for every thread? How does this execution happen in the GPU?