
I am trying to code a simple example in CUDA C, following a screencast, but I get the wrong result.

This is the example:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
#define SIZE    1024

__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    int i = threadIdx.x;

    if (i < n){
        c[i] = a[i] + b[i];
    }

}

int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");

    }
    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));

    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));

    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);

    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
    }
    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d\n", i, c[i]);

    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}

The result is:

c[0]=0
c[1]=0
c[2]=0
c[3]=0
c[4]=0
c[5]=0
c[6]=0
c[7]=0
c[8]=0
c[9]=0

but I expect this result:

c[0]=0
c[1]=2
c[2]=4
c[3]=6
c[4]=8
c[5]=10
c[6]=12
c[7]=14
c[8]=16
c[9]=18

Please, can anyone help me with this?

Abdou
  • Try inserting a `cudaDeviceSynchronize()` call after launching the kernel. – 3Dave Nov 06 '14 at 23:12
  • Doing [proper error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) will give you a clue about your problem. What is your GPU compute capability? GPUs with CC < 2.0 have a limit of 512 threads per block. – pQB Nov 07 '14 at 09:59
  • 3
    @DavidLively: That will have no effect. cudaMemcpy is a blocking call. – talonmies Nov 07 '14 at 13:21
  • You have commented (in one of the answers) that you have a GT210. The compute capability of that GPU is 1.2. Review the specification of the architecture to find the maximum number of threads per block. That said, what happens in your example if you change `SIZE` to `512`? – pQB Nov 07 '14 at 13:38
  • @talonmies I must have `cudaMemcpyAsync()` on the brain. I see it everywhere I look. :) – 3Dave Nov 07 '14 at 14:48
  • One other thing: you won't get an accurate error check this way. Add the line `cudaStatus = cudaGetLastError();` right after your launch. You should (hopefully) get a better error message. – 3Dave Nov 07 '14 at 14:55
  • @pQB when I change SIZE to 512 I still get the same result – Abdou Nov 07 '14 at 15:00
    Oh, my apologies. Max threads per block for CC 1.2 is 256! Try again with `SIZE=256`. Let us know if that works. – pQB Nov 07 '14 at 15:24
  • No, not working, still the same thing :( – Abdou Nov 07 '14 at 15:28
  • Many cross comments between the question and the Answers. Please, could you update the question with the GPU model, the operating system, the software you are using to code and compile, and the command used to compile the code? – pQB Nov 07 '14 at 15:44

1 Answer

I made some wrong comments, so I will try to fix my errors and give a correct answer here. First of all, please pay attention to the comments above about proper CUDA error checking.
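
In its simplest form, that means checking the return value of every CUDA API call and, right after the kernel launch, calling `cudaGetLastError()` and `cudaDeviceSynchronize()`. A minimal sketch of just that part, reusing the `cudaStatus` variable you already declared (the gpuErrchk macro used further below wraps the same idea):

    // Right after the kernel launch in your main():
    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);

    cudaStatus = cudaGetLastError();            // reports launch/configuration errors
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));

    cudaStatus = cudaDeviceSynchronize();       // reports errors raised while the kernel runs
    if (cudaStatus != cudaSuccess)
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(cudaStatus));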

Second, the Maximum Thread Block Size for a GT210 (CC 1.2) is 512, not 256 as I commented in a moment of confusion.

That said, once you do the mentioned error checking you should get the following error:

GPUassert: invalid device function 

In this case, the error indicates that the architecture you compiled the code for is higher than that of the device you run it on. You are compiling the example for devices of compute capability 2.0 or above (as you commented), but then you execute it on your GT210, which has compute capability 1.2.

So, first, re-compile your example for the corresponding architecture: change the compilation flag

-gencode=arch=compute_20

to

-gencode=arch=compute_12
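
If you are unsure which architecture to target, you can query the device at runtime with `cudaGetDeviceProperties`; a minimal standalone sketch (not part of the original example) that prints the two values relevant here:

#include "cuda_runtime.h"
#include <stdio.h>

int main()
{
    // Query the first device and print its compute capability and block-size limit
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("%s: compute capability %d.%d, max threads per block %d\n",
           prop.name, prop.major, prop.minor, prop.maxThreadsPerBlock);
    return 0;
}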

Once you have successfully compiled the example for your architecture, you will get the following error (because you ALREADY are doing proper error checking ;) ):

GPUassert: invalid configuration argument 

In this case, the error indicates that you are requesting more resources than your architecture (compute capability 1.2) provides: you are trying to launch a block of SIZE = 1024 threads, but the maximum thread block size is 512, so you cannot configure a block with more than 512 threads.

So, adjust SIZE to 512 and everything should work as expected. Below is your example with proper CUDA error checking added.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<windows.h>
#define SIZE    512   // 1024 exceeds the 512 threads-per-block limit of compute capability 1.2

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    int i = threadIdx.x;

    if (i < n){
        c[i] = a[i] + b[i];
    }
}

int main()
{
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    cudaError_t cudaStatus;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU         installed?");
    }
    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    c = (int *)malloc(SIZE*sizeof(int));

    gpuErrchk( cudaMalloc(&d_a, SIZE*sizeof(int)) );
    gpuErrchk( cudaMalloc(&d_b, SIZE*sizeof(int)) );
    gpuErrchk( cudaMalloc(&d_c, SIZE*sizeof(int)) );

    for (int i = 0; i < SIZE; i++)
    {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    gpuErrchk( cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice) );

    VectorAdd<<< 1, SIZE >>>(d_a, d_b, d_c, SIZE);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );

    gpuErrchk( cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost) );

    for (int i = 0; i < 10; ++i)
        printf("c[%d] = %d\n", i, c[i]);

    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}
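
If you would rather keep SIZE at 1024 (or any larger value) on this GPU, the usual approach is to launch several blocks instead of one and compute a global index inside the kernel. A minimal sketch of just the parts that change, keeping the rest of the example above as is:

__global__ void VectorAdd(int *a, int *b, int *c, int n)
{
    // Global index: each thread of each block handles one element
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n){
        c[i] = a[i] + b[i];
    }
}

...

    // 256 threads per block is valid on compute capability 1.2;
    // round the block count up so all SIZE elements are covered
    int threadsPerBlock = 256;
    int blocks = (SIZE + threadsPerBlock - 1) / threadsPerBlock;
    VectorAdd<<< blocks, threadsPerBlock >>>(d_a, d_b, d_c, SIZE);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );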
pQB