1

I'm trying to allocate a matrix on the device, fill it with some number in a kernel, and then copy it back to the host. The problem is that on the host only one row seems to be filled.

I got something like this:

9 9 9 9
-1 -1 -1 -1
-1 -1 -1 -1
-1 -1 -1 -1

Here is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Writes the runtime's description of status `x` to stderr on its own line.
// The message is emitted for every status passed in, not only for failures.
void check(cudaError x) {
    const char* description = cudaGetErrorString(x);
    fprintf(stderr, "%s\n", description);
}

// Prints a row-major matrix to stdout, one matrix row per output line,
// preceded by a separator line.
//
//   v1     - host pointer to width * height tightly packed ints
//   width  - number of elements in each row (the row stride used for indexing)
//   height - number of rows
//
// The outer loop walks the rows (height) and the inner loop the elements of a
// row (width); with the bounds the other way round a non-square matrix was
// printed with the wrong shape and read past the end of the buffer.
void showMatrix2(int* v1, int width, int height) {
    printf("---------------------\n");
    for (int row = 0; row < height; row++) {
        const int* rowStart = v1 + row * width;
        for (int col = 0; col < width; col++) {
            printf("%d ", rowStart[col]);
        }
        printf("\n");
    }
}

// Kernel from the question: each thread of a 2D launch stores the value 9
// into one element of `tab`, guarded by a bounds check against width/height.
//
//   tab    - device buffer (main passes the pointer from cudaMallocPitch)
//   width  - upper bound applied to the x-derived index
//   height - upper bound applied to the y-derived index
//   pitch  - pitch value reported by cudaMallocPitch, received here as an int
//
// NOTE(review): this is the indexing the answer below diagnoses as broken.
// Per that answer, the pitch from cudaMallocPitch is a row width in bytes, so
// it cannot be used directly as an offset into an int array, and it should
// multiply a row index rather than a column index.
__global__ void kernel(int* tab,int width, int height, int pitch) {

    // Global 2D thread coordinates: x is used as `row`, y as `col`.
    int row = threadIdx.x + blockIdx.x * blockDim.x;
    int col = threadIdx.y + blockIdx.y * blockDim.y;

    // Threads whose coordinates fall outside the bounds do nothing.
    if (row < width && col < height) {
        // Element offset computed as col * pitch + row (see NOTE above).
        tab[col * pitch + row] = 9;
    }
}

// Driver from the question: allocates a pitched 4x4 int matrix on the device,
// clears it, launches `kernel` to fill it, copies it back into a packed host
// buffer, and prints the result together with the pitch.
int main()
{
    int width = 4;
    int height = 4;

    int* d_tab;
    int* h_tab;

    // Byte size of a tightly packed width x height int matrix.
    int realSize = width * height* sizeof(int);

    // Pitched device allocation; `pitch` receives the row pitch chosen by the
    // runtime (a width in bytes, per the answer below).
    size_t pitch;
    check( cudaMallocPitch(&d_tab, &pitch, width * sizeof(int), height) );
    // Host buffer sized for the packed matrix; the malloc result is not checked.
    h_tab = (int*)malloc(realSize);
    // NOTE(review): this sets the first realSize bytes of the allocation. If
    // the pitch is larger than width * sizeof(int) (the answer's run reports
    // 512), that does not reach every row; cudaMemset2D looks like the
    // pitched counterpart - confirm.
    check( cudaMemset(d_tab, 0, realSize) );

    // 4x4 blocks of 4x4 threads, i.e. 16x16 threads for a 4x4 matrix; the
    // kernel's bounds check makes the surplus threads return without writing.
    dim3 grid(4, 4);
    dim3 block(4, 4);
    // `pitch` is a size_t here and is converted to the kernel's int parameter.
    kernel <<<grid, block>>>(d_tab, width, height, pitch);

    // Copies `height` rows of width * sizeof(int) bytes each from the pitched
    // device buffer into the packed host buffer.
    check( cudaMemcpy2D(h_tab, width*sizeof(int), d_tab, pitch, width*sizeof(int), height, cudaMemcpyDeviceToHost) );

    showMatrix2(h_tab, width, height);
    // NOTE(review): `pitch` is a size_t but is printed with %d; item 3 of the
    // answer below flags this specifier as incorrect on 64-bit platforms.
    printf("\nPitch size: %d \n", pitch);
    // Waits for a character on stdin before exiting.
    getchar();
    // d_tab and h_tab are not released before returning.
    return 0;
}
Knight
  • 551
  • 1
  • 11
  • 24

1 Answers1

3
  1. Any time you are having trouble with a CUDA code, in addition to doing error checking, run your code with cuda-memcheck. If you had done so, you would have gotten at least a hint as to what is going on, and then you could use techniques like this to continue your own debugging. Even if you can't figure it out, the cuda-memcheck output will be useful to others trying to help you.

  2. You have invalid writes in your kernel. There are multiple errors here. To properly access a pitched allocation in kernel code, I strongly recommend studying the example given in the documentation for cudaMallocPitch. In a nutshell, this kind of index generation is just broken:

    tab[col * pitch + row]
    

    Firstly, pitch returned by cudaMallocPitch is a width in bytes. You cannot use it as an adjustment to an index for quantities like int or float (study the documentation). Secondly, the pitch value should ultimately multiply a row index, not a column index.

  3. Not related to your problem, but your final printf statement has an incorrect format specifier if you are on a 64-bit platform: pitch is a size_t, so it should be %ld (or better, %lu; %zu is the portable specifier for size_t).

Here is code with the indexing issue fixed; it seems to work correctly for me:

$ cat t109.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

// Reports a CUDA status on stderr as one line of text.
// Nothing is filtered: a successful status is printed as well (the run shown
// after this listing prints "no error" once per checked call).
void check(cudaError x) {
    const char* text = cudaGetErrorString(x);
    fprintf(stderr, "%s\n", text);
}

// Prints a packed, row-major matrix to stdout: a separator line, then one
// output line per matrix row with the elements separated by spaces.
//
//   v1     - host pointer to width * height ints, rows stored back to back
//   width  - elements per row; also the stride between consecutive rows
//   height - number of rows
//
// Rows are iterated up to `height` and columns up to `width`. The previous
// version had the two bounds exchanged, which only worked for square
// matrices and indexed outside the buffer when width > height.
void showMatrix2(int* v1, int width, int height) {
    printf("---------------------\n");
    for (int r = 0; r < height; r++) {
        for (int c = 0; c < width; c++) {
            printf("%d ", v1[r * width + c]);
        }
        printf("\n");
    }
}

// Stores 9 into every element of a pitched width x height int matrix.
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension covers the
// columns (at least `width` threads) and the y dimension covers the rows (at
// least `height` threads). Surplus threads are filtered by the bounds check.
// No shared memory is used.
//
//   tab    - base pointer of the allocation returned by cudaMallocPitch
//   width  - number of int elements in each row
//   height - number of rows
//   pitch  - distance between consecutive rows in BYTES, as reported by
//            cudaMallocPitch
//
// The x index is the column and the y index is the row, so each is compared
// against the bound it is actually used with. The previous mapping (x as row
// checked against width, y as column checked against height) was only correct
// for square matrices, and made neighbouring threads write `pitch` bytes
// apart instead of to adjacent elements.
__global__ void kernel(int* tab,int width, int height, int pitch) {

    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;

    if (col < width && row < height) {
        // The pitch is a byte count, so step to the row through a char*
        // before indexing the row as an array of int.
        int* rowStart = (int*)((char*)tab + (size_t)row * (size_t)pitch);
        rowStart[col] = 9;
    }
}

// Reports `status` on stderr only when it signals a failure. check() prints
// on success as well, so this is used for the extra calls whose success does
// not need a line of output.
static void reportFailure(const char* what, cudaError_t status) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    }
}

// Allocates a pitched width x height int matrix on the device, clears it,
// fills it with `kernel`, copies it back into a packed host buffer and prints
// the matrix followed by the pitch chosen by the runtime.
//
// Returns 0 on the normal path and EXIT_FAILURE when either allocation fails.
int main()
{
    const int width = 4;
    const int height = 4;
    // Bytes in one tightly packed row; the device rows are `pitch` bytes apart.
    const size_t rowBytes = width * sizeof(int);

    int* d_tab = NULL;
    size_t pitch = 0;
    cudaError_t allocStatus = cudaMallocPitch(&d_tab, &pitch, rowBytes, height);
    check( allocStatus );
    if (allocStatus != cudaSuccess) {
        return EXIT_FAILURE;
    }

    int* h_tab = (int*)malloc(rowBytes * height);
    if (h_tab == NULL) {
        fprintf(stderr, "host allocation failed\n");
        reportFailure("cudaFree", cudaFree(d_tab));
        return EXIT_FAILURE;
    }

    // A pitched allocation has padding after each row, so clear it row by
    // row with the 2D variant; a linear cudaMemset of width*height*sizeof(int)
    // bytes would only touch the beginning of the buffer.
    check( cudaMemset2D(d_tab, pitch, 0, rowBytes, height) );

    // Size the grid from the matrix (ceiling division) instead of hard-coding
    // it: x spans `width`, y spans `height`.
    dim3 block(4, 4);
    dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y);
    // The kernel takes the pitch as an int; the value is a small byte count.
    kernel <<<grid, block>>>(d_tab, width, height, (int)pitch);
    // A launch does not return a status; fetch it explicitly.
    reportFailure("kernel launch", cudaGetLastError());

    // Blocking copy: it also waits for the kernel to finish and surfaces any
    // error raised while it ran.
    check( cudaMemcpy2D(h_tab, rowBytes, d_tab, pitch, rowBytes, height, cudaMemcpyDeviceToHost) );

    showMatrix2(h_tab, width, height);
    // %zu is the printf conversion for size_t.
    printf("\nPitch size: %zu \n", pitch);

    reportFailure("cudaFree", cudaFree(d_tab));
    free(h_tab);
    return 0;
}
$ nvcc -arch=sm_61 -o t109 t109.cu
$ cuda-memcheck ./t109
========= CUDA-MEMCHECK
no error
no error
no error
---------------------
9 9 9 9
9 9 9 9
9 9 9 9
9 9 9 9

Pitch size: 512
========= ERROR SUMMARY: 0 errors
$
Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257