2

I am trying to use cudaMemcpy3D to transfer dynamically allocated 3d matrix (tensor). Tensor is allocated as contiguous block of memory (see code below). I tried various combinations of cudaExtent and cudaMemcpy3DParms, however the order of elements gets mixed up. I created the following example to demonstrate the issue:

#include <stdio.h>

// Allocates an Nx x Ny x Nz tensor as a triple-pointer structure whose
// elements live in ONE contiguous payload block (tensor[0][0]), so the
// whole data set can be moved with a single flat copy.
// NOTE(review): all three malloc calls are unchecked — an allocation
// failure here dereferences NULL.
int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;

   // three tiers: Nx row-of-row pointers, Nx*Ny row pointers,
   // and one contiguous Nx*Ny*Nz int payload
   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));

   // wire the pointer tiers into the payload so that
   // tensor[i][j] == tensor[0][0] + (i * Ny + j) * Nz
   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }

   return tensor;
}

// Single-thread debug kernel: walks the pitched 3D allocation and prints
// every element via device printf.
// NOTE(review): this is the buggy indexing the question is about — per the
// answer below, the slice offset scales by Nx (should be Ny for this
// extent), the row offset uses i, and the element index uses j, so the
// printed order does not match the host layout.
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   int i, j, k;
   char *tensorslice;
   int *tensorrow;

   for (i = 0; i < Nx; i++) {
      for (j = 0; j < Ny; j++) {
         for (k = 0; k < Nz; k++) {
            // slice = base + k * pitch * Nx; row = slice + i * pitch
            tensorslice = ((char *)tensor.ptr) + k * tensor.pitch * Nx;
            tensorrow = (int *)(tensorslice + i * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, tensorrow[j]);
         }
      }
   }   
}

int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;

   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;

   // host: contiguous triple-pointer tensor; device: pitched 3D allocation
   // NOTE(review): return codes of cudaMalloc3D, cudaMemcpy3D and the
   // kernel launch are never checked, so failures pass silently (the
   // answer reports cudaMemcpy3D actually errors out for this code).
   h_tensor = alloc_tensor(Nx, Ny, Nz);
   cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));

   // fill the host tensor with 0..Nx*Ny*Nz-1, k fastest-varying
   for(i = 0; i < Nx; i++) {
      for(j = 0; j < Ny; j++) {
         for(k = 0; k < Nz; k++) {
            h_tensor[i][j][k] = value++;
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[i][j][k]);
         }
      }
   }

   cudaMemcpy3DParms cpy = { 0 };
   // NOTE(review): per the accepted answer, this call is wrong —
   // make_cudaPitchedPtr takes (ptr, pitch, width-in-elements, height),
   // so the last two arguments should be Nx, Ny rather than Ny, Nz.
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;

   cudaMemcpy3D(&cpy);

   // NOTE(review): no cudaDeviceSynchronize before exit — device printf
   // output from the kernel may never be flushed.
   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);

   // ... clean-up
}

Output for host variable (h_tensor) and device (d_tensor) differ, looking like

h_tensor[0][0][0] = 0
h_tensor[0][0][1] = 1
h_tensor[0][0][2] = 2
h_tensor[0][0][3] = 3
h_tensor[0][1][0] = 4
h_tensor[0][1][1] = 5
h_tensor[0][1][2] = 6
...

d_tensor[0][0][0] = 0
d_tensor[0][0][1] = 12
d_tensor[0][0][2] = 24
d_tensor[0][0][3] = 36
d_tensor[0][1][0] = 1
d_tensor[0][1][1] = 13
d_tensor[0][1][2] = 25
...

What am I doing wrong? What would be the correct way to use cudaMemcpy3D?

user3452579
  • 413
  • 4
  • 14
  • I have successfully used `cudaMemcpy2D` for 2D matrices allocated in the similar way. I assume that same approach could be extended for 3D allocations, just have to figure out correct parameters. – user3452579 Apr 26 '14 at 14:37
  • Sorry, I misread. You are doing a flat allocation. – Robert Crovella Apr 26 '14 at 14:47

1 Answer

6
  1. Any time you are having trouble with a cuda code, it's a good idea to do proper cuda error checking. The code you have posted here, at least, does not run correctly for me - the cudaMemcpy3D line throws an error. This is due to item 2 below. (I suspect the code you used to generate the output was not identical to the code you have shown here, but that's just a guess.)
  2. Your usage of make_cudaPitchedPtr is not correct:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Ny, Nz);
    

    review the API documentation. Making a CUDA pitched pointer this way is no different between 2D and 3D. So it makes no sense to pass 3 different dimensions as you are doing. Instead do this:

    cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
    
  3. The remaining issues I found I attribute to incorrect understanding of 3 dimensions in C. The last subscript on a multiply-subscripted array is the rapidly varying dimension, i.e. it is the one where adjacent values in memory occupy adjacent index values. Your usage of Z in the 3rd dimension is confusing to me due to this. Your host allocation was using Nx in the first subscript place, but your device indexing didn't match. There are obviously multiple ways to handle this. If you don't like my arrangement, you can change it, but the host and device indexing must match.

Anyway, the following code modifications worked for me:

#include <stdio.h>
#include <stdlib.h>

// Allocates an Nx x Ny x Nz tensor as a triple-pointer structure whose
// payload (tensor[0][0]) is ONE contiguous block of Nx*Ny*Nz ints, so the
// whole data set can be handed to cudaMemcpy3D as a single flat buffer
// with element (i, j, k) at flat offset (i * Ny + j) * Nz + k.
//
// Returns NULL if any allocation fails (previously the mallocs were
// unchecked and a failure dereferenced NULL).
// Free with: free(t[0][0]); free(t[0]); free(t);
int ***alloc_tensor(int Nx, int Ny, int Nz) {
   int i, j;
   int ***tensor;

   // three tiers: Nx row-of-row pointers, Nx*Ny row pointers,
   // and one contiguous Nx*Ny*Nz int payload
   tensor = (int ***) malloc((size_t) (Nx * sizeof(int **)));
   if (tensor == NULL)
      return NULL;
   tensor[0] = (int **) malloc((size_t) (Nx * Ny * sizeof(int *)));
   if (tensor[0] == NULL) {
      free(tensor);
      return NULL;
   }
   tensor[0][0] = (int *) malloc((size_t) (Nx * Ny * Nz * sizeof(int)));
   if (tensor[0][0] == NULL) {
      free(tensor[0]);
      free(tensor);
      return NULL;
   }

   // wire the pointer tiers into the payload so that
   // tensor[i][j] == tensor[0][0] + (i * Ny + j) * Nz
   for(j = 1; j < Ny; j++)
      tensor[0][j] = tensor[0][j-1] + Nz;
   for(i = 1; i < Nx; i++) {
      tensor[i] = tensor[i - 1] + Ny;
      tensor[i][0] = tensor[i - 1][0] + Ny * Nz;
      for(j = 1; j < Ny; j++)
         tensor[i][j] = tensor[i][j - 1] + Nz;
   }

   return tensor;
}

// Single-thread debug kernel: prints every element of the pitched 3D
// allocation via device printf.
// Layout matches cudaMalloc3D(make_cudaExtent(Nx * sizeof(int), Ny, Nz)):
// slice k spans Ny rows of `pitch` bytes each; row j holds Nx ints, so
// element (i, j, k) lives at base + (k * Ny + j) * pitch + i * sizeof(int).
__global__ void kernel(cudaPitchedPtr tensor, int Nx, int Ny, int Nz) {
   char *base = (char *)tensor.ptr;

   for (int i = 0; i < Nx; i++)
      for (int j = 0; j < Ny; j++)
         for (int k = 0; k < Nz; k++) {
            int *row = (int *)(base + k * tensor.pitch * Ny
                                    + j * tensor.pitch);
            printf("d_tensor[%d][%d][%d] = %d\n", i, j, k, row[i]);
         }
}

// Demonstrates a correct cudaMemcpy3D of a flat host tensor into a
// pitched device allocation, then prints both sides for comparison.
// The host tensor is allocated "z-major" (first subscript = Nz) so that
// the fastest-varying host subscript (last, = Nx here via h_tensor[k][j][i])
// matches the device row direction.
// Per item 1 of the answer: every CUDA call is now checked, and clean-up
// (previously a placeholder comment) is implemented.
int main() {
   int i, j, k, value = 0;
   int Nx = 2, Ny = 6, Nz = 4;

   int ***h_tensor;
   struct cudaPitchedPtr d_tensor;
   cudaError_t err;

   h_tensor = alloc_tensor(Nz, Ny, Nx);
   if (h_tensor == NULL) {
      fprintf(stderr, "alloc_tensor failed\n");
      return 1;
   }
   err = cudaMalloc3D(&d_tensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz));
   if (err != cudaSuccess) {
      fprintf(stderr, "cudaMalloc3D: %s\n", cudaGetErrorString(err));
      return 1;
   }

   // fill, then print, the host tensor (flat index = (k * Ny + j) * Nx + i)
   for(i = 0; i < Nx; i++)
      for(j = 0; j < Ny; j++)
         for(k = 0; k < Nz; k++)
            h_tensor[k][j][i] = value++;
   for(i = 0; i < Nx; i++)
      for(j = 0; j < Ny; j++)
         for(k = 0; k < Nz; k++)
            printf("h_tensor[%d][%d][%d] = %d\n", i, j, k, h_tensor[k][j][i]);

   // srcPtr describes one 2D slice of the flat host block:
   // pitch = Nx * sizeof(int) bytes, width = Nx elements, height = Ny rows
   cudaMemcpy3DParms cpy = { 0 };
   cpy.srcPtr = make_cudaPitchedPtr(h_tensor[0][0], Nx * sizeof(int), Nx, Ny);
   cpy.dstPtr = d_tensor;
   cpy.extent = make_cudaExtent(Nx * sizeof(int), Ny, Nz);
   cpy.kind = cudaMemcpyHostToDevice;

   err = cudaMemcpy3D(&cpy);
   if (err != cudaSuccess) {
      fprintf(stderr, "cudaMemcpy3D: %s\n", cudaGetErrorString(err));
      return 1;
   }

   kernel<<<1, 1>>>(d_tensor, Nx, Ny, Nz);
   err = cudaGetLastError();   // catches launch-configuration errors
   if (err != cudaSuccess) {
      fprintf(stderr, "kernel launch: %s\n", cudaGetErrorString(err));
      return 1;
   }
   err = cudaDeviceSynchronize();   // catches execution errors, flushes printf
   if (err != cudaSuccess) {
      fprintf(stderr, "kernel execution: %s\n", cudaGetErrorString(err));
      return 1;
   }

   // clean-up (previously left as a "// ... clean-up" placeholder)
   cudaFree(d_tensor.ptr);
   free(h_tensor[0][0]);
   free(h_tensor[0]);
   free(h_tensor);
   return 0;
}
Community
  • 1
  • 1
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thanks, I'll try it out and report my findings. I have tried many combinations of the order of `Nx`, `Ny` and `Nz` parameters, code I posted was just one of them. Problem arose when I was porting some code to CUDA, and in original code it was `alloc_tensor(Nx, Ny, Nz)`. Therefore I am looking for whatever modifications I need to make just to have that constraint satisfied. I should have said that more clearly. – user3452579 Apr 26 '14 at 18:40
  • Ok, I got around to test this. I just swapped `Nx` and `Nz`, and got the solution I was looking for. Thanks Robert. – user3452579 Apr 27 '14 at 11:06