0

I followed the example in "Using cudaMemcpy3D to transfer *** pointer". However, my task is to copy a 3D subsection of one device global memory array into another device global memory array, for example:

// Source volume dimensions, in elements.
Nx =10;
Ny=10;
Nz = 10;
struct cudaPitchedPtr sourceTensor;
// The extent width is passed in bytes (Nx * sizeof(int)); height and depth are plain counts.
cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(int), Ny, Nz))
... // here I am populating sourceTensor with some Data
// Target volume dimensions, in elements (5x5x5, smaller than the 10x10x10 source).
NxTarget = 5;
NyTarget = 5;
NzTarget = 5;
struct cudaPitchedPtr targetTensor;
// Same convention as above: width in bytes, height and depth as counts.
cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(int), NyTarget, NzTarget))

// here I get lost ...
cudaMemcpy3DParms cpy = { 0 };
// NOTE(review): sourceTensor is declared as a struct cudaPitchedPtr, so
// sourceTensor[0][0] does not index into it. The start location is presumably
// meant to be selected through cpy.srcPos (make_cudaPos) while srcPtr stays the
// unmodified pitched pointer -- confirm against the cudaMemcpy3D documentation.
cpy.srcPtr = make_cudaPitchedPtr(sourceTensor[0][0], Nx * sizeof(int), Nx, Ny); // How to make it start in chosen location like for example 1,2,3
cpy.dstPtr = targetTensor;
// Extent of the region to copy: the full target volume, width again in bytes.
cpy.extent = make_cudaExtent(NxTarget * sizeof(int), NyTarget , NzTarget );
cpy.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&cpy);

So, in the example above, I am looking for a way to copy from sourceTensor to targetTensor all the data where

x indices are in range (1,6)

y indices are in range (2,7)

z indices are in range (3,8)

So I want only a subsection of the source array, but I do not know how to define make_cudaPitchedPtr and make_cudaExtent properly in order to achieve this.

  • Is your overall volume size just that range (probably not, it is 10x10x10?) or the subsection you want to copy (probably yes)? Is it performance critical not to copy the whole volume? You can use several copy commands on the host in a loop or only copy the subsection on the device into another global memory area and then transfer this area to the CPU. You can also write directly into pinned host memory from the GPU. I do not think there is a ready-to-use command to do, what you want. – Sebastian Jan 31 '22 at 13:53
  • Thanks for the response! The source is 10x10x10 and I want just part of the voxels that are in it (10x10x10 is just the example; in reality it is far bigger). – Jakub Mitura Jan 31 '22 at 15:37
  • Probably `cudaMemcpy3D` can do it: Have you tried to set `cpy.srcPos` and `cpy.dstPos` and `make_cudaPos`? – Sebastian Jan 31 '22 at 16:33

1 Answer

2

The srcPos parameter in your cudaMemcpy3DParms should make this pretty easy. Here is an example:

$ cat t1957.cu
#include <cstdio>

typedef int it;  // index type
typedef int dt;  // data type

/**
 * Fills a pitched 3D allocation with the pattern value(x, y, z) = z*100 + y*10 + x.
 *
 * Launch layout: the kernel does not use threadIdx/blockIdx, so every launched
 * thread walks the whole volume. It is meant to be launched as <<<1,1>>>.
 *
 * Preconditions: rows are addressed as pitch * (z*Ny + y), i.e. consecutive
 * slices are assumed to be pitch*Ny bytes apart, so Ny must match the height
 * the allocation was created with.
 *
 * @param sourceTensor pitched pointer to the volume to fill (ptr and pitch are used)
 * @param Nx           number of elements per row
 * @param Ny           number of rows per slice
 * @param Nz           number of slices
 */
__global__ void populate_kernel(struct cudaPitchedPtr sourceTensor, it Nx, it Ny, it Nz) {

  for (it z = 0; z < Nz; z++)
    for (it y = 0; y < Ny; y++) {
      // Widen to size_t before multiplying so the row index cannot overflow
      // the (int) index type on large volumes.
      const size_t row = (size_t)z * Ny + y;
      // The row base address does not depend on x, so compute it once per row.
      dt *rowPtr = (dt *)((char *)sourceTensor.ptr + sourceTensor.pitch * row);
      for (it x = 0; x < Nx; x++)
        rowPtr[x] = z*100+y*10+x;
    }
}

/**
 * Checks that a pitched 3D allocation holds the sub-volume of the
 * value(x, y, z) = z*100 + y*10 + x pattern that starts at the given offsets.
 *
 * For every target element (x, y, z) the expected value is
 * (z+NzOffset)*100 + (y+NyOffset)*10 + (x+NxOffset). Each mismatch is reported
 * with printf; nothing is printed when the whole sub-volume matches.
 *
 * Launch layout: the kernel does not use threadIdx/blockIdx, so every launched
 * thread checks the whole volume. It is meant to be launched as <<<1,1>>>.
 *
 * Preconditions: rows are addressed as pitch * (z*NyTarget + y), so NyTarget
 * must match the height the allocation was created with.
 *
 * @param targetTensor pitched pointer to the volume to check (ptr and pitch are used)
 * @param NxTarget     number of elements per row of the target
 * @param NyTarget     number of rows per slice of the target
 * @param NzTarget     number of slices of the target
 * @param NxOffset     x offset (in elements) of the sub-volume inside the source
 * @param NyOffset     y offset (in rows) of the sub-volume inside the source
 * @param NzOffset     z offset (in slices) of the sub-volume inside the source
 */
__global__ void verify_kernel(struct cudaPitchedPtr targetTensor, it NxTarget, it NyTarget, it NzTarget, it NxOffset, it NyOffset, it NzOffset) {

  for (it z = 0; z < NzTarget; z++)
    for (it y = 0; y < NyTarget; y++) {
      // Rows are pitch bytes apart; widen before multiplying to avoid int overflow.
      const size_t row = (size_t)z * NyTarget + y;
      const dt *rowPtr = (const dt *)((const char *)targetTensor.ptr + targetTensor.pitch * row);
      for (it x = 0; x < NxTarget; x++) {
        // Value the source holds at the corresponding shifted coordinate.
        const dt expected = (z + NzOffset)*100 + (y + NyOffset)*10 + (x + NxOffset);
        if (rowPtr[x] != expected) {
          printf("mismatch at (%d,%d,%d): got %d, expected %d\n", x, y, z, rowPtr[x], expected);
        }
      }
    }
}


// Evaluates a CUDA runtime call; on failure prints the error with its source
// location and makes main() return 1. Only usable inside main() because of the
// `return`. Device memory is not released on this path; the process is exiting.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
              cudaGetErrorString(err_));                                   \
      return 1;                                                            \
    }                                                                      \
  } while (0)

/**
 * Demonstrates copying a 3D sub-volume between two pitched device allocations
 * with cudaMemcpy3D by setting srcPos.
 *
 * Steps: allocate and fill a 10x10x10 source, allocate a 5x5x5 target, copy the
 * 5x5x5 region of the source that starts at element offset (1,2,3) into the
 * target, then run verify_kernel on the result.
 *
 * @return 0 on success, 1 if any CUDA runtime call or kernel launch fails.
 */
int main(){

  // Source volume: Nx x Ny x Nz elements.
  it Nx =10;
  it Ny=10;
  it Nz = 10;
  struct cudaPitchedPtr sourceTensor;
  // Extent width is given in bytes, height and depth as counts.
  CUDA_CHECK(cudaMalloc3D(&sourceTensor, make_cudaExtent(Nx * sizeof(dt), Ny, Nz)));
  populate_kernel<<<1,1>>>(sourceTensor, Nx, Ny, Nz);
  CUDA_CHECK(cudaGetLastError());  // launch errors are not returned by <<<>>> itself

  // Target volume: NxTarget x NyTarget x NzTarget elements.
  it NxTarget = 5;
  it NyTarget = 5;
  it NzTarget = 5;
  struct cudaPitchedPtr targetTensor;
  CUDA_CHECK(cudaMalloc3D(&targetTensor, make_cudaExtent(NxTarget* sizeof(dt), NyTarget, NzTarget)));

  // Zero-initialised, so dstPos stays (0,0,0): the region lands at the target's origin.
  cudaMemcpy3DParms cpy = { 0 };
  it NxOffset = 1;
  it NyOffset = 2;
  it NzOffset = 3;
  // The x component of the position is in bytes, hence the sizeof(dt) scaling;
  // y and z are row and slice indices.
  cpy.srcPos = make_cudaPos(NxOffset*sizeof(dt), NyOffset, NzOffset);
  cpy.srcPtr = sourceTensor;
  cpy.dstPtr = targetTensor;
  // Size of the copied region: width in bytes, height and depth as counts.
  cpy.extent = make_cudaExtent(NxTarget * sizeof(dt), NyTarget , NzTarget );
  cpy.kind = cudaMemcpyDeviceToDevice;
  CUDA_CHECK(cudaMemcpy3D(&cpy));

  verify_kernel<<<1,1>>>(targetTensor, NxTarget, NyTarget, NzTarget, NxOffset, NyOffset, NzOffset);
  CUDA_CHECK(cudaGetLastError());
  // Wait for the kernel so its printf output is flushed and execution errors surface.
  CUDA_CHECK(cudaDeviceSynchronize());

  CUDA_CHECK(cudaFree(targetTensor.ptr));
  CUDA_CHECK(cudaFree(sourceTensor.ptr));
  return 0;
}
#undef CUDA_CHECK
$ nvcc -o t1957 t1957.cu
$ cuda-memcheck ./t1957
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

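Note that when neither the source nor the destination is specified as a cudaArray type, the element size is always assumed to be unsigned char (i.e. 1 byte). That is why the x components of srcPos and extent above are given in bytes (multiplied by sizeof(dt)), while the y and z components are plain row and slice counts.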

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257