0

I'm passing a matrix from host to device and trying to store it on the gpu memory. to test if it really works I'm copying the first row from the matrix in the gpu to the host. but after it returns from the wrapper it prints garbage values.

C file:

int col;
srand(time(NULL));
matrix = (int**) malloc(10*sizeof(int*));

for(int j = 0; j < 10; j++)
{
    col = 3 + (rand() % 7);
    matrix[j] = (int*) malloc(sizeof(int)*col);
    matrix[j][0] = col-1;
    for(int i = 1; i < col; i++)
    {
       matrix[j][i] = i;
    }
}

int first_row[10];
int rows = 10;
pass_matrix_kernel_wrapper(matrix, &rows); 

foo_wrapper(first_row); // get the first row of the matrix from the gpu

for(int i = 0; i < matrix[0][0]; i++)
{
    printf("%d, ", first_row[i]);
}

Cuda file:

__shared__ int **gpu_matrix;
__shared__ int gpu_rows;


void pass_matrix_kernel_wrapper(int** matrix, int* rows)
{
    cudaMalloc((void***)(&gpu_matrix), sizeof(int*) * (*rows));
    for (int i = 0; i < *rows; i++)
    {
    int cols = matrix[i][0] + 1;
        int* temp;

        cudaMalloc( (void**)  &(temp), sizeof(int) * cols); // allocate for 1 int in    each int pointer
        cudaMemcpy(temp, matrix[i], sizeof(int) * cols, cudaMemcpyHostToDevice); // copy data
        cudaMemcpy(gpu_matrix+i, &temp, sizeof(int*) * cols, cudaMemcpyHostToDevice);
    }
}

void foo_wrapper(int* back)
{
    int* temp;
    cudaMalloc( (void**)  &(temp), sizeof(int) * 11); // allocate for 1 int in each int pointer

    test_kernel<<<1,1>>>(temp); // just checking if it works
    cudaDeviceSynchronize();

    int size = temp[0] + 1;
    cudaMemcpy(back, &temp, sizeof(int) * size, cudaMemcpyDeviceToHost);
}

__global__ void test_kernel(int* back)
{
    for(int i = 0;i < gpu_matrix[0][0] + 1;i++) // gpu_matrix[0][0] stores num of cols in row
    {
        back[i] = gpu_matrix[0][i];
    }
}
Elior
  • 3,178
  • 6
  • 37
  • 67
  • 2
    How about adding [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) to your code. And struggling with 2D (`**`) matrices is fine if that's what you want, but most folks find it easier to flatten things to 1D (`*`) and do pointer/index arithmetic. There are plenty of examples of 2D matrix copy on the CUDA tag, you might take a look at some of them. – Robert Crovella Mar 29 '14 at 22:30
  • 1
    You use `gpu_matrix` as an input to Your kernel? Shouldn't it be a global not shared memory variable then? – Dori Mar 30 '14 at 13:34

0 Answers0