0

Am new to cuda. The above code is a cuda program am working on. when executing the for loop it shows that the lat.exe has stopped working. But when i decrease the for loop from 5000 to 1000 it works perfectly fine. How do i make it work with 5000 because that's the number i will be working with.

int main() {

int *a, *b, *c;
int *d_a, *d_b, *d_c;


a = (int *)malloc(SIZE*sizeof(int));
b = (int *)malloc(SIZE*sizeof(int));
c = (int *)malloc(SIZE*sizeof(int));

cudaMalloc( &d_a, SIZE*sizeof(int));
cudaMalloc( &d_b, SIZE*sizeof(int));
cudaMalloc( &d_c, SIZE*sizeof(int));


for( int i = 0; i < SIZE; i++ )
{
    a[i] =i;
    b[i] =i;
    c[i] =0;
}

cudaMemcpy( d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice );
cudaMemcpy( d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice );


InitialAdd<<< 3 , SIZE >>>( d_a, d_b, d_c, SIZE);

cudaMemcpy( c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost );

for( int i = 0; i < 5000; i++)
    printf("c[%d] = %d\n", i, c[i]);

free(a);
free(b);
free(c);

cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);

return 0;

}

1 Answers1

3

you can not create block with 5000 threads. that's your problem. That's why your code is working with size = 1000 and not working with size = 5000. Block is up to 1024 threads (generally).

user2076694
  • 806
  • 1
  • 6
  • 10