-3

I am new to CUDA. I have written some simple code, which tries to copy a random initialized matrix to device memory, increments the value of each matrix entry by one, and transfer it back to the host memory.

There is no error while compiling or running the code. But, it seems that the kernel does not launch as the value of matrix entries are the same after launching the kernel.

Any idea what is happening there?

#include <iostream>

using namespace std;

#define SIZE 2

void print_matrix (int size, float *array);
void matrix_initialize(int size, float *array);

__global__ void LU(float * m, int size){
m[threadIdx.y*size + threadIdx.x] ++ ;
}


int main(){
    srand(0);
    //variables
    float *a =  new float[SIZE*SIZE];
    dim3 blockdim(2,2,0);
    dim3 griddim(1,0,0);

    //initialize 
    matrix_initialize(SIZE, a);
    print_matrix (SIZE, a);


    //allocate space on device memory:
    float * Ad;
    int size = SIZE * SIZE;
    cudaMalloc ((void **)&Ad, size);

    //transfer data to device memory:
    cudaMemcpy(Ad , a, size, cudaMemcpyHostToDevice);

    //run the kernel
    LU<<<griddim,blockdim>>>(Ad, SIZE);


    // transfer the data back to the host memory
    cudaMemcpy(a , Ad, size, cudaMemcpyDeviceToHost);

    //test if the kernel runing the kernel has changed the value
    print_matrix (SIZE, a);


    // free device memory :
    cudaFree (Ad);



return 0;
}


void print_matrix (int size, float *array){
    for (int i=0; i < size*size ; i++){

        if(i % size == 0)
        cout << endl;
        cout << array [i] << "  ";

    }

}

void matrix_initialize(int size, float *array){

    for (int i = 0; i< SIZE*SIZE; i++){
            array[i] = rand()/(float) RAND_MAX;
    }
}
Jash lino
  • 1
  • 1
  • Runtime error checking would have been nice: http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api. – void_ptr Nov 23 '14 at 06:37

1 Answers1

2

Unused dimensions should be set to 1 instead of 0:

dim3 blockdim(2, 2, 1);
dim3 griddim(1, 1, 1);

Your code launches 2 x 2 x 0 = 0 blocks, 1 x 0 x 0 = 0 threads each.

Your size calculation is wrong:

int size = SIZE * SIZE * sizeof(float);

Your code does not take array element size into account.

void_ptr
  • 618
  • 5
  • 15
  • 2
    Further, unused dimensions don't need to be specified at all, the constructor for `dim3` has a default value of 1 for any unspecified dimensions – talonmies Nov 23 '14 at 17:01