For all of the reasons suggested in the previous linked answer and elsewhere, this isn't necessarily a good approach for handling 3D arrays. A better approach is to flatten the array and use pointer arithmetic to simulate 3D access.
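By way of comparison, here is a minimal sketch of that flattened approach (the doSmthFlat kernel name, the fixed sizes, and the <<<1,1>>> launch are purely illustrative, not part of the example that follows): element [i][j][k] of a z-y-x array simply lives at offset (i*sz_y + j)*sz_x + k in one contiguous buffer.

#include <cstdio>
#include <cuda_runtime.h>

// flattened 3D access: compute the offset instead of chasing pointers
__global__ void doSmthFlat(int *a, int sz_x, int sz_y, int sz_z) {
    for (int i = 0; i < sz_z; i++)
        for (int j = 0; j < sz_y; j++)
            for (int k = 0; k < sz_x; k++)
                a[(i * sz_y + j) * sz_x + k] = i - j + k;
}

int main() {
    const int sx = 4, sy = 3, sz = 2;
    int *d_a;
    cudaMalloc((void**)&d_a, sz * sy * sx * sizeof(int));   // one allocation instead of sz*sy of them
    doSmthFlat<<<1,1>>>(d_a, sx, sy, sz);
    int h_a[sz * sy * sx];
    cudaMemcpy(h_a, d_a, sizeof(h_a), cudaMemcpyDeviceToHost);   // one copy brings the whole array back
    for (int i = 0; i < sz; i++)
        for (int j = 0; j < sy; j++)
            for (int k = 0; k < sx; k++)
                printf("[%d][%d][%d]=%d\n", i, j, k, h_a[(i * sy + j) * sx + k]);
    cudaFree(d_a);
    return 0;
}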
But just to demonstrate that the previous example doesn't really require hard-coded dimensions, here's that example modified to show variable (run-time) dimension usage:
#include <iostream>
#include <cstdio>
#include <cstdlib>

// error-checking helper: prints the CUDA error string plus the file/line of the failing call
inline void GPUassert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}
#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); }
// triple-subscript access: each a[i][j][k] must chase two device pointers
// before the final int can be written
__global__ void doSmth(int ***a, int sz_x, int sz_y, int sz_z) {
    for (int i = 0; i < sz_z; i++)
        for (int j = 0; j < sz_y; j++)
            for (int k = 0; k < sz_x; k++)
                a[i][j][k] = i - j + k;
}
int main() {
    unsigned sx;
    unsigned sy;
    unsigned sz;
    std::cout << std::endl << "Enter x dimension (3rd subscript): ";
    std::cin >> sx;
    std::cout << std::endl << "Enter y dimension (2nd subscript): ";
    std::cin >> sy;
    std::cout << std::endl << "Enter z dimension (1st subscript): ";
    std::cin >> sz;
    // h_c: host array of host pointer tables; each h_c[i][j] is a device pointer to a row of sx ints
    int ***h_c = (int ***)malloc(sz*sizeof(int **));
    for (int i = 0; i < sz; i++) {
        h_c[i] = (int **)malloc(sy*sizeof(int *));
        for (int j = 0; j < sy; j++)
            GPUerrchk(cudaMalloc((void**)&h_c[i][j], sx*sizeof(int)));
    }
    // h_c1: host array of device pointers; each h_c1[i] is a device-resident copy of the row-pointer table h_c[i]
    int ***h_c1 = (int ***)malloc(sz*sizeof(int **));
    for (int i = 0; i < sz; i++) {
        GPUerrchk(cudaMalloc((void***)&(h_c1[i]), sy*sizeof(int *)));
        GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], sy*sizeof(int *), cudaMemcpyHostToDevice));
    }
    // d_c: device copy of the top-level pointer table, usable as a[i][j][k] in the kernel
    int ***d_c;
    GPUerrchk(cudaMalloc((void****)&d_c, sz*sizeof(int **)));
    GPUerrchk(cudaMemcpy(d_c, h_c1, sz*sizeof(int **), cudaMemcpyHostToDevice));
    doSmth<<<1,1>>>(d_c, sx, sy, sz);
    GPUerrchk(cudaPeekAtLastError());
    // res is a variable-length array -- a g++/nvcc extension, not standard C++
    int res[sz][sy][sx];
    // copy back one sx-length row at a time, using the device row pointers kept in h_c
    for (int i = 0; i < sz; i++)
        for (int j = 0; j < sy; j++)
            GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j], sx*sizeof(int), cudaMemcpyDeviceToHost));
    std::cout << std::endl;
    for (int i = 0; i < sz; i++)
        for (int j = 0; j < sy; j++)
            for (int k = 0; k < sx; k++)
                printf("[%d][%d][%d]=%d\n", i, j, k, res[i][j][k]);
}
I have modified the data stored by the kernel to i-j+k instead of i+j+k. I have also ordered the subscripts as [z][y][x], because this suggests thread-index-based arrangements such as [threadIdx.z][threadIdx.y][threadIdx.x], which are most conducive to coalesced access. However, this type of multiple-subscripted array in the kernel will still tend to be inefficient due to the pointer-chasing needed to resolve the final location of the data.
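For contrast, on a flattened allocation each thread can compute its element's address directly; a rough sketch (the fillFlat name and the launch shape shown in the comments are my own, purely illustrative):

// one thread per element; x is the fastest-varying subscript, so consecutive
// threads in a warp (threadIdx.x) touch consecutive ints: coalesced access
__global__ void fillFlat(int *a, int sz_x, int sz_y, int sz_z) {
    int k = blockIdx.x * blockDim.x + threadIdx.x;   // x subscript
    int j = blockIdx.y * blockDim.y + threadIdx.y;   // y subscript
    int i = blockIdx.z * blockDim.z + threadIdx.z;   // z subscript
    if (k < sz_x && j < sz_y && i < sz_z)
        a[(i * sz_y + j) * sz_x + k] = i - j + k;
}

// launched with something like:
// dim3 block(8, 8, 4);
// dim3 grid((sx+7)/8, (sy+7)/8, (sz+3)/4);
// fillFlat<<<grid, block>>>(d_a, sx, sy, sz);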