I'm trying to allocate matrix on device, fill it with some number in kernel and then copy it back to host. Problem is that on host only one row seems to be filled.
I got something like this:
9 9 9 9
-1 -1 -1 -1
-1 -1 -1 -1
-1 -1 -1 -1
Here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
void check(cudaError x) {
fprintf(stderr, "%s\n", cudaGetErrorString(x));
}
void showMatrix2(int* v1, int width, int height) {
printf("---------------------\n");
for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
printf("%d ", v1[i * width + j]);
}
printf("\n");
}
}
__global__ void kernel(int* tab,int width, int height, int pitch) {
int row = threadIdx.x + blockIdx.x * blockDim.x;
int col = threadIdx.y + blockIdx.y * blockDim.y;
if (row < width && col < height) {
tab[col * pitch + row] = 9;
}
}
int main()
{
int width = 4;
int height = 4;
int* d_tab;
int* h_tab;
int realSize = width * height* sizeof(int);
size_t pitch;
check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
h_tab = (int*)malloc(realSize);
check( cudaMemset(d_tab, 0, realSize) );
dim3 grid(4, 4);
dim3 block(4, 4);
kernel <<<grid, block>>>(d_tab, width, height, pitch);
check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );
showMatrix2(h_tab, width, height);
printf("\nPitch size: %d \n", pitch);
getchar();
return 0;
}