Managing a 2D CUDA Array

Question

I'm trying to pass a 2d array to a kernel so that each thread can access index = threadIdx.x + (blockIdx.x * blockDim.x) but I'm having trouble figuring out just how to do this and how to copy the data back over.

// Host-side fragment from the question: allocate a pitched 2D device buffer,
// zero it, launch one block per row, then copy the result back to the host.
// pitch receives the padded row size in bytes chosen by cudaMallocPitch.
size_t pitch;
// Requested row width is block_size ints (expressed in bytes); height is num_blocks rows.
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
// NOTE(review): the last argument is multiplied by sizeof(int), so it asks for more
// rows than the num_blocks rows requested from cudaMallocPitch above.
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int));
kernel<<<grid_size, block_size>>>(d_array, pitch);
// NOTE(review): the width here is block_size without "* sizeof(int)", unlike the
// width used for the allocation, so only part of each row is copied.
// NOTE(review): pitch is also passed as the host pitch; presumably h_array is a
// plain (non-pitched) buffer -- confirm against its allocation.
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost);
// Pseudocode for the expected result: every element of every row equals 1.
for (num_blocks)
  for(block_size)
    h_array[block][thread] should be 1

// Writes 1 into one element per thread: blockIdx.x picks the row, threadIdx.x the column.
// pitch is the row stride in bytes, so the row offset is applied to a byte pointer.
__global__ void kernel(int *array, int pitch) {
  char *row_start = (char*)array + blockIdx.x * pitch;
  ((int*)row_start)[threadIdx.x] = 1;
}

What am I doing wrong, here?

Why are you casting the array to a (char*)? That will result in bad pointer arithmetic. — LarryPel, Aug 29 '13 at 16:41
That's how it was described in these two questions: http://stackoverflow.com/questions/1047369/allocate-2d-array-on-device-memory-in-cuda http://stackoverflow.com/questions/5029920/how-to-use-2d-arrays-in-cuda — user1743798, Aug 29 '13 at 16:42
@LarryPel: No it won't. The pitch is in bytes, and a pointer to a byte sized type is required to perform the pointer arithmetic correctly. — talonmies, Aug 29 '13 at 16:48
Please provide a complete, compilable program. Don't make us play 20 questions about what is "block_size", what is "grid_size" etc. — Robert Crovella, Aug 29 '13 at 16:57

score 1 · Accepted Answer · answered Aug 29 '13 at 17:10

1

Your cudaMemset2D is accessing a larger memory region than you previously allocated with cudaMallocPitch. Also, your cudaMemcpy2D is copying only a small portion of that memory.

You should use the function in the following way:

// Allocate num_blocks rows of block_size ints; pitch receives the padded row size in bytes.
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
// The height argument is a row count, so it must be num_blocks: "num_blocks * sizeof(int)"
// asks for more rows than were allocated above.
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
kernel<<<grid_size, block_size>>>(d_array, pitch);
// The width argument is in bytes, hence "* sizeof(int)" (this was missing in the question).
// The destination pitch is the host row size in bytes: this assumes h_array is a plain,
// tightly packed host buffer rather than a pitched one.
cudaMemcpy2D(h_array, block_size * sizeof(int), d_array, pitch, block_size * sizeof(int), num_blocks, cudaMemcpyDeviceToHost);

answered Aug 29 '13 at 17:10

hidrargyro

257
1
7

In addition, if you had done [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) your `cudaMemset2D` (at least) would have thrown an error. – Robert Crovella Aug 29 '13 at 17:20
Also, presumably your `h_array` is not pitched. Therefore the `pitch` parameter you are passing for it in cudaMemcpy2D is incorrect. You should probably be passing `block_size*sizeof(int)` (or something like that) for the pitch of `h_array`. – Robert Crovella Aug 29 '13 at 17:25

score 1 · Answer 2 · edited May 19 '22 at 17:25

Here's a complete code that passes a basic test with the errors mentioned by @hidrargyro fixed:

$ cat t236.cu
#include <stdio.h>
#include <stdlib.h>

// cudaCheckErrors(msg): fetches the most recent CUDA runtime error with
// cudaGetLastError(). If it is anything other than cudaSuccess, prints msg,
// the runtime's error string and the file/line of the call site to stderr,
// then terminates the process with exit code 1.
// The do { ... } while (0) wrapper makes the macro expand to a single statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (cudaSuccess != status_) { \
            const char *reason_ = cudaGetErrorString(status_); \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, reason_, __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


// Sets one int to 1 per thread in a pitched 2D buffer.
//   array: base pointer of the buffer
//   pitch: distance between the starts of consecutive rows, in bytes
// The block index selects the row and the thread index selects the column.
// There is no bounds check: the launch is expected to use one block per row and
// no more threads per block than there are ints in a row.
__global__ void kernel(int *array, int pitch) {
  const unsigned int r = blockIdx.x;
  const unsigned int c = threadIdx.x;
  // Row offsets are in bytes, so do the arithmetic on a byte pointer before
  // reinterpreting the row start as int*.
  char *base = (char*)array;
  int *row = (int*)(base + r * pitch);
  *(row + c) = 1;
}

// Host driver: allocates a pitched num_blocks x block_size int buffer on the
// device, zeroes it, launches kernel with one block per row and one thread per
// column, copies the result into a tightly packed host array and verifies that
// every element is 1.
// Returns 0 and prints "success" when all elements match; returns 1 on a host
// allocation failure or on the first mismatch. CUDA errors abort through
// cudaCheckErrors.
int main(){
  const int block_size = 256;   // threads per block, also ints per row
  const int num_blocks = 256;   // blocks in the grid, also number of rows
  const int grid_size = num_blocks;
  const size_t row_bytes = block_size * sizeof(int);  // unpadded row width in bytes

  int *h_array = (int *)malloc(row_bytes * num_blocks);
  if (h_array==0) {printf("malloc fail\n"); return 1;}

  // cudaMallocPitch is the only device allocation needed. A plain cudaMalloc
  // into the same pointer beforehand would be overwritten here and leaked.
  int *d_array = 0;
  size_t pitch;
  cudaMallocPitch(&d_array, &pitch, row_bytes, num_blocks);
  cudaCheckErrors("cudaMallocPitch fail");
  // Width is in bytes, height is in rows.
  cudaMemset2D(d_array, pitch, 0, row_bytes, num_blocks);
  cudaCheckErrors("cudaMemset2D fail");
  // pitch is converted to the kernel's int parameter at the call.
  kernel<<<grid_size, block_size>>>(d_array, pitch);
  // Wait for the kernel so that execution errors are visible to the check below.
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");

  // The host buffer is tightly packed, so its pitch is row_bytes; the device
  // side uses the padded pitch returned by cudaMallocPitch.
  cudaMemcpy2D(h_array, row_bytes, d_array, pitch, row_bytes, num_blocks, cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy 2D fail");

  // Stop at the first mismatch but still fall through to the cleanup below.
  int status = 0;
  for (int i = 0; i<num_blocks && status == 0; i++)
    for(int j = 0; j<block_size; j++)
      if (h_array[i*block_size+j] != 1) {
        printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]);
        status = 1;
        break;
      }
  if (status == 0) printf("success\n");

  // Release both buffers on every path that reaches this point.
  cudaFree(d_array);
  cudaCheckErrors("cudaFree fail");
  free(h_array);
  return status;
}

$ nvcc -arch=sm_20 -o t236 t236.cu
$ ./t236
success
$

This is like CUDA_SAFE_CALL(). I have always wondered about the strange loop with while(0). — hidrargyro, Aug 29 '13 at 19:23

Managing a 2D CUDA Array

2 Answers2