Can anyone explain how this CUDA kernel executes?

Question

I am a beginner in CUDA and I tried this example code.

int main()
{
    int i;
    cudaError_t cudastatus;
    float in[9]={1,2,3,4,5,6,7,8,9};
    float* h_in=in;
    float* d_in={0};
    cudaMalloc((void**)&d_in,9*sizeof(float));
    cudaCheckErrors("malloc failed");
    cudastatus=cudaMemcpy(d_in,h_in,9*sizeof(float),cudaMemcpyHostToDevice);
    cudaCheckErrors("memcpyh2d failed");
    float* d_out={0};
    cudaMalloc((void**)&d_out,9*sizeof(float));
    float* out[9]={0};
    kernel<<<3,3>>>(d_in,d_out);
    cudastatus=cudaDeviceSynchronize();
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
         fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
         getchar();
    }
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching             Kernel!\n", cudaStatus);
    getchar();
      }
    cudastatus=cudaMemcpy(out,d_out,9*sizeof(float),cudaMemcpyDeviceToHost);
    cudaCheckErrors("memcpyd2h failed");
    for(i=0;i<9;i++)
    {
        printf("%f\n",out[i]);
    }
    getchar();
    return 0;

}

The kernel code is like this

__device__ void func(float temp)
{
     float a=2;
     temp=temp*a;
     return;
}
__global__ void kernel(float* d_in, float* d_out)
{
     int tid=(blockIdx.x*blockDim.x)+threadIdx.x;
     float temp=d_in[tid];
     func(temp);
     d_out[tid]=d_out[tid]+temp;
}

But when I print the values of out array, the values are all zero. Here my question is Is the device function being called from the kernel for every thread? How does this execution happen in the GPU?

score 2 · Accepted Answer · edited May 23 '17 at 12:04

Any time you're having trouble with a CUDA code, you should do proper cuda error checking and run your code with cuda-memcheck. It may be that the kernel is not executing at all due to some other error in your program, or a problem with the CUDA setup on your machine.

This function:

__device__ void func(float temp)
{
     float a=2;
     temp=temp*a;
     return;
}

cannot do anything useful, for reasons that have nothing to do with CUDA.

In C/C++, when we pass a parameter to a function like this:

void func(float temp)

the temp parameter is passed by value. This means that a copy is made of temp, and given to the function to use. This copy is separate from whatever temp was in the calling environment. Any modifications to temp will not show up in the calling environment. Therefore the function does nothing useful. Even though it appears to multiply temp by a (2 in this case), that modified value is lost when the function returns.

To avoid this, you should learn more about C/C++ programming and consider passing the value by pointer (simulated pass-by-reference):

__device__ void func(float *temp)

or by reference:

__device__ void func(float &temp)

This line of code is also not what you intended:

float* out[9]={0};

I think it should be:

float out[9]={0};

You want an array of float quantities, not an array of float pointers.

Here is a "fixed" version of your code:

#include <stdio.h>

__device__ void func(float &temp)
{
     float a=2;
     temp=temp*a;
     return;
}
__global__ void kernel(float* d_in, float* d_out)
{
     int tid=(blockIdx.x*blockDim.x)+threadIdx.x;
     float temp=d_in[tid];
     func(temp);
     d_out[tid]=d_out[tid]+temp;
}

int main()
{
    int i;
    cudaError_t cudastatus;
    float in[9]={1,2,3,4,5,6,7,8,9};
    float* h_in=in;
    float* d_in={0};
    cudastatus=cudaMalloc((void**)&d_in,9*sizeof(float));
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm0 fail %s\n", cudaGetErrorString(cudastatus));
    }
    cudastatus=cudaMemcpy(d_in,h_in,9*sizeof(float),cudaMemcpyHostToDevice);
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm1 fail %s\n", cudaGetErrorString(cudastatus));
    }
    float* d_out={0};
    cudastatus=cudaMalloc((void**)&d_out,9*sizeof(float));
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm2 fail %s\n", cudaGetErrorString(cudastatus));
    }
    cudaMemset(d_out, 0, 9*sizeof(float));
    float out[9]={0};
    kernel<<<3,3>>>(d_in,d_out);
    cudaDeviceSynchronize();
    cudastatus = cudaGetLastError();
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(cudastatus));
    }
    cudastatus=cudaMemcpy(out,d_out,9*sizeof(float),cudaMemcpyDeviceToHost);
    if (cudastatus != cudaSuccess) {
         fprintf(stderr, "cm3 fail %s\n", cudaGetErrorString(cudastatus));
    }
    for(i=0;i<9;i++)
    {
        printf("%f\n",out[i]);
    }
    return 0;
}

Thanks for the explanation! It helped me a lot. Yes, the example function here does nothing useful. Just wrote it to check the working of code. Thanks again! — learning_always, Jul 30 '14 at 12:06
I just noticed that you are not initializing `d_out` anywhere in your code. So in this line in your kernel: ` d_out[tid]=d_out[tid]+temp;` you are using the value without initializing it. So that is also broken and should be fixed. I added a cudaMemset line to address that. — Robert Crovella, Jul 30 '14 at 12:21
That's the catch! I was just wondering how to set that right. Thanks a lot once again! — learning_always, Jul 30 '14 at 12:24

Can anyone explain how this CUDA kernel executes?

1 Answers1