I'm passing a matrix from host to device and trying to store it on the gpu memory. to test if it really works I'm copying the first row from the matrix in the gpu to the host. but after it returns from the wrapper it prints garbage values.
C file:
int col;
srand(time(NULL));
matrix = (int**) malloc(10*sizeof(int*));
for(int j = 0; j < 10; j++)
{
col = 3 + (rand() % 7);
matrix[j] = (int*) malloc(sizeof(int)*col);
matrix[j][0] = col-1;
for(int i = 1; i < col; i++)
{
matrix[j][i] = i;
}
}
int first_row[10];
int rows = 10;
pass_matrix_kernel_wrapper(matrix, &rows);
foo_wrapper(first_row); // get the first row of the matrix from the gpu
for(int i = 0; i < matrix[0][0]; i++)
{
printf("%d, ", first_row[i]);
}
Cuda file:
__shared__ int **gpu_matrix;
__shared__ int gpu_rows;
void pass_matrix_kernel_wrapper(int** matrix, int* rows)
{
cudaMalloc((void***)(&gpu_matrix), sizeof(int*) * (*rows));
for (int i = 0; i < *rows; i++)
{
int cols = matrix[i][0] + 1;
int* temp;
cudaMalloc( (void**) &(temp), sizeof(int) * cols); // allocate for 1 int in each int pointer
cudaMemcpy(temp, matrix[i], sizeof(int) * cols, cudaMemcpyHostToDevice); // copy data
cudaMemcpy(gpu_matrix+i, &temp, sizeof(int*) * cols, cudaMemcpyHostToDevice);
}
}
void foo_wrapper(int* back)
{
int* temp;
cudaMalloc( (void**) &(temp), sizeof(int) * 11); // allocate for 1 int in each int pointer
test_kernel<<<1,1>>>(temp); // just checking if it works
cudaDeviceSynchronize();
int size = temp[0] + 1;
cudaMemcpy(back, &temp, sizeof(int) * size, cudaMemcpyDeviceToHost);
}
__global__ void test_kernel(int* back)
{
for(int i = 0;i < gpu_matrix[0][0] + 1;i++) // gpu_matrix[0][0] stores num of cols in row
{
back[i] = gpu_matrix[0][i];
}
}