-1

I'm trying to use a dynamic 3D array argument in a kernel function in CUDA, but I can't get it to work.

__global__ void kernel ( 3D array pointer )
{
// do something
}

int main()
{
    const int NUM_OF_ARRAY;
    const int ROW;
    const int CAL;

    int arr[NUM_OF_ARRAY][ROW][CAL]; 
    // Maybe I should use cudaMalloc3D or cudaMalloc3DArray

    dim3 grid( , , ,);
    dim3 block( , , , );

    kernel <<< grid, block >>> ( ? );
 }

I saw Robert's answer for sending 3d array to CUDA kernel but I think my case is little different.

If the array's row and column sizes are determined at runtime, how can I allocate that memory in CUDA and pass its pointer to the kernel function?

I tried to use cudaMalloc3D or cudaMalloc3DArray, but I couldn't do it well because I have never used them before.

Can anyone show a simple example using dynamic 3D array arguments?

It will be helpful for me. Thanks.

Community
  • 1
  • 1
Umbrella
  • 475
  • 3
  • 9
  • 19
  • You should deal with 3D CUDA dynamic arrays in the same way as you would deal with 3D C/C++ dynamic arrays. The only thing to take care of is access coalescence. Take a look at [this](http://pastebin.com/QmGiMqET) example. – Vitality Aug 01 '14 at 05:36
  • Possible duplicate of [3D indices access in CUDA for nonlinear diffusion](http://stackoverflow.com/questions/15175967/3d-indices-access-in-cuda-for-nonlinear-diffusion) – Vitality Aug 01 '14 at 05:40
  • `cudaMalloc3D` is not the right thing to use. The answer you linked is certainly one way to do it. The second example I gave, even though it appears to be hard-coded for 2,2,2, is actually workable for the runtime-determined dimensions. If you really want a 3d (triple-subscript) runtime-determined dimensions array, this question is a duplicate of that one. – Robert Crovella Aug 01 '14 at 09:51

1 Answers1

2

For all of the reasons suggested in the previous linked answer and elsewhere, this isn't necessarily a good approach for handling 3D arrays. A better approach is to flatten the array and use pointer arithmetic to simulate 3D access.

But just to demonstrate that the previous example doesn't really need to be hard coded dimensions, here's that example modified to show variable (run-time) dimension usage:

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// Report a CUDA runtime error with the call site; optionally abort the process.
// code:  result returned by a CUDA runtime API call
// file:  source file of the call site (supplied by the GPUerrchk macro)
// line:  source line of the call site
// Abort: when true (default), exit with the error code after printing
inline void GPUassert(cudaError_t code, const char * file, int line, bool Abort=true)
{
    // Compare against cudaSuccess rather than a bare 0.
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line);
        if (Abort) exit(code);
    }
}

// Wrap every CUDA runtime call so failures are reported with file/line context.
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }



  // Single-thread kernel: walks a triple-subscripted device array and stores
  // z - y + x at a[z][y][x]. The outer subscript runs over sz_z, the middle
  // over sz_y, the inner over sz_x (each level is an array of device pointers).
  __global__ void doSmth(int*** a, int sz_x, int sz_y, int sz_z) {
    for (int z = 0; z < sz_z; ++z) {
      for (int y = 0; y < sz_y; ++y) {
        for (int x = 0; x < sz_x; ++x) {
          a[z][y][x] = z - y + x;
        }
      }
    }
 }
 // Demonstrates a runtime-dimensioned triple-subscripted ([z][y][x]) device
 // array: a host array of device pointers-to-device-pointers is built level
 // by level, copied to the device, and handed to the kernel.
 int main() {

  unsigned sx;
  unsigned sy;
  unsigned sz;
  std::cout << std::endl << "Enter x dimension (3rd subscript): " ;
  std::cin >> sx;
  std::cout << std::endl << "Enter y dimension (2nd subscript): " ;
  std::cin >> sy;
  std::cout << std::endl << "Enter z dimension (1st subscript): " ;
  std::cin >> sz;

  // h_c[i][j] is a DEVICE pointer to a row of sx ints; h_c itself and the
  // h_c[i] pointer arrays live on the host.
  int*** h_c = (int***) malloc(sz*sizeof(int**));
  for(int i=0; i<(int)sz; i++) {
   h_c[i] = (int**) malloc(sy*sizeof(int*));
   for(int j=0; j<(int)sy; j++)
    GPUerrchk(cudaMalloc((void**)&h_c[i][j],sx*sizeof(int)));
  }
  // h_c1[i] is a DEVICE array of sy row pointers, filled from h_c[i].
  int ***h_c1 = (int ***) malloc(sz*sizeof(int **));
  for (int i=0; i<(int)sz; i++){
    GPUerrchk(cudaMalloc((void***)&(h_c1[i]), sy*sizeof(int*)));
    GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], sy*sizeof(int*), cudaMemcpyHostToDevice));
    }
  // d_c is the top-level DEVICE array of sz pointers, filled from h_c1.
  int*** d_c;
  GPUerrchk(cudaMalloc((void****)&d_c,sz*sizeof(int**)));
  GPUerrchk(cudaMemcpy(d_c,h_c1,sz*sizeof(int**),cudaMemcpyHostToDevice));
  doSmth<<<1,1>>>(d_c, sx, sy, sz);
  GPUerrchk(cudaPeekAtLastError());
  // Flat heap buffer replaces the non-standard VLA `int res[sz][sy][sx]`
  // (runtime-sized arrays are not legal C++); element [i][j][k] lives at
  // index (i*sy + j)*sx + k.
  std::vector<int> res((size_t)sz * sy * sx);
  for(int i=0; i<(int)sz; i++)
   for(int j=0; j<(int)sy; j++)
    GPUerrchk(cudaMemcpy(&res[((size_t)i*sy + j)*sx], h_c[i][j],sx*sizeof(int),cudaMemcpyDeviceToHost));
  std::cout << std::endl;
  for(int i=0; i<(int)sz; i++)
   for(int j=0; j<(int)sy; j++)
    for(int k=0; k<(int)sx; k++)
     printf("[%d][%d][%d]=%d\n",i,j,k,res[((size_t)i*sy + j)*sx + k]);
  // Release every allocation made above (the original example leaked all of
  // them): device rows, device pointer arrays, top-level device array, and
  // the host-side pointer tables.
  for(int i=0; i<(int)sz; i++) {
   for(int j=0; j<(int)sy; j++)
    GPUerrchk(cudaFree(h_c[i][j]));
   free(h_c[i]);
   GPUerrchk(cudaFree(h_c1[i]));
  }
  GPUerrchk(cudaFree(d_c));
  free(h_c);
  free(h_c1);
  return 0;
 }

I have modified the data stored by the kernel to i-j+k instead of i+j+k. Also, I have created a [z][y][x] order to the subscripts, because this will suggest the usage of thread index computed arrangements such as [threadIdx.z][threadIdx.y][threadIdx.x] which will be most conducive to coalesced access. However, this type of multiple-subscripted array in the kernel will still tend to be inefficient due to pointer-chasing to resolve the final location of the data.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257