2

I am new to CUDA programming. I just wrote a simple vector-add program, but its output is wrong.

My code is attached below:

// Element-wise vector addition on the device: C[idx] = A[idx] + B[idx].
// The element index is derived from the x components of the thread and block
// indices only. Threads whose index is not below n exit without touching memory.
__global__
void vecAddKernel(float* A, float* B, float* C, int n){
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) {
        return;  // index past the end of the vectors
    }
    C[idx] = A[idx] + B[idx];
}


// Host wrapper: computes C[i] = A[i] + B[i] for i in [0, n) on the GPU.
//
// A, B : host input arrays of at least n floats.
// C    : host output array of at least n floats.
// n    : number of elements; n <= 0 is a no-op.
//
// Allocates three temporary device buffers, copies A and B to the device,
// launches vecAddKernel with 256 threads per block, copies the result back
// into C and frees the device buffers. The first CUDA error encountered is
// reported on stderr; remaining steps are skipped and C is left unmodified
// by this function in that case.
void vecAdd(float* A, float* B, float* C, int n){
    if (n <= 0) {
        // Nothing to compute; a launch with zero blocks would also be rejected
        // as an invalid configuration.
        return;
    }

    // Do the byte-count arithmetic in size_t so it cannot overflow an int.
    const size_t size = (size_t)n * sizeof(float);
    const int threadsPerBlock = 256;
    // Integer ceil-division; written this way so it cannot overflow for n >= 1.
    const int blocks = (n - 1) / threadsPerBlock + 1;

    // Initialised to NULL so the cudaFree calls below are harmless no-ops for
    // any buffer that was never allocated.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    cudaError_t err = cudaMalloc((void**)&d_A, size);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_B, size);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_C, size);
    if (err == cudaSuccess) err = cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        vecAddKernel<<<blocks, threadsPerBlock>>>(d_A, d_B, d_C, n);
        // A kernel launch returns nothing; launch-configuration errors must be
        // fetched explicitly.
        err = cudaGetLastError();
    }
    // This blocking copy also surfaces errors raised while the kernel executed.
    if (err == cudaSuccess) err = cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "vecAdd: CUDA error %d: %s\n", (int)err, cudaGetErrorString(err));
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

// Test driver: adds two fixed 10-element arrays with vecAdd() and prints the
// ten result values on one line. argc/argv are unused; always returns 0.
int main(int argc, const char *argv[])
{
    float A[10] = {1,2,3,4,5,6,7,8,9,10};
    float B[10] = {10,20,30,40,50,60,70,80,90,100};
    float* C;
    // NOTE(review): this requests 10 bytes, not room for 10 floats
    // (10 * sizeof(float) bytes). The result is also not checked for NULL.
    C = (float*)malloc(10);
    // Zeroes 10*sizeof(float) bytes, which is more than the 10 bytes requested
    // above, so this write and the later accesses to C run past the allocation.
    memset(C, 0, 10*sizeof(float));
    vecAdd(A, B, C, 10);
    for (int i = 0; i < 10; i++) {
        printf("%f, ", C[i]);
    }
    printf("\n");
    // C is not freed before returning.
    return 0;
}

I am using CUDA 6.5 with GCC 4.9. The program outputs the following:

-9087809423414278337673035776.000000, 1.836612, -28609169409429209088.000000, 1.795911, 0.000000, 0.777735, -125923819520.000000, 1.807979, 585061501691794292736.000000, 1.826568

Is there a way to debug this program? What is going wrong? Any help is appreciated.

Jake0x32
  • 1,402
  • 2
  • 11
  • 18

1 Answer

4

I don't have an environment to test this in right now, but I think the problem is in your main function:

C = (float*)malloc(10);

should be

C = (float*)malloc(10 * sizeof(float));

I'm not sure whether there are any other problems.

To debug a CUDA program, I usually define an error-checking function and a wrapper macro like this:

// checkCudaErrors(call): evaluates a CUDA runtime call and aborts the program
// with a diagnostic if it did not return cudaSuccess. The file and line of the
// call site are captured by the macro.
//
// The expansion is wrapped in do { ... } while (0) so the macro behaves as a
// single statement; a bare { ... } block followed by ';' breaks
// `if (cond) checkCudaErrors(x); else ...`.
#define checkCudaErrors(err) do { __checkCudaErrors((err), __FILE__, __LINE__); } while (0)

// Helper behind the checkCudaErrors macro.
//   err  : status returned by a CUDA runtime API call.
//   file : source file of the call site (for the message).
//   line : source line of the call site (for the message).
// Returns normally when err is cudaSuccess; otherwise prints the numeric code
// and its description to stderr and terminates the process with EXIT_FAILURE.
// NOTE(review): names containing a double underscore are reserved for the
// implementation in C++; the name is kept here for compatibility.
inline void __checkCudaErrors(cudaError_t err, const char *file, int line)
{
   if (err == cudaSuccess)
   {
      return;
   }
   fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, (int)err, cudaGetErrorString(err));
   exit(EXIT_FAILURE);
}

Then wrap each CUDA API call with the checkCudaErrors macro:

checkCudaErrors( cudaMalloc((void**)&d_A, size) );

I'm no CUDA expert, but I hope this helps :-)

EDIT:

Also refer to this question: What is the canonical way to check for errors using the CUDA runtime API?

Also, in plain C it is better not to cast the result of malloc (note that nvcc compiles host code as C++, where the cast is required):

C = malloc(10 * sizeof(float));
Community
  • 1
  • 1
justmscs
  • 756
  • 4
  • 12
  • 1
    Good answer, and extra credit for the suggestions on error checking etc. Note however that [casting the result of malloc in C should be discouraged](http://stackoverflow.com/questions/605845/do-i-cast-the-result-of-malloc). – Paul R Dec 25 '14 at 10:25
  • 3
    CUDA uses a C++ based toolchain, and thus casting the result of `malloc()` to a pointer of the appropriate type is necessary. – njuffa Dec 25 '14 at 19:39