2

I am trying to do a simple test with cudaMemcpy3D using CUDA 5.5. I have searched around and found different examples and read the appropriate Runtime API documentation, but can't figure out what I am doing wrong. The following code compiles just fine, but when I try to run it, I get a segmentation fault on the cudaMemcpy3D call. I tried running this with cuda-gdb, but can't get any useful information out of it to tell me what is wrong (maybe because I'm not that familiar with gdb/cuda-gdb usage). Any help in figuring out where my error is would be greatly appreciated.

#include <cstdio>
#include <cuda_runtime.h>

// Repro from the question: fills a 60 x 60 x 60 float volume on the host and
// tries to copy it into a cudaMalloc3D allocation with cudaMemcpy3D.
// After each step it prints the string for cudaGetLastError(); the cudaError_t
// values returned by the API calls themselves are never inspected.
int main() {
  static const size_t NX = 60; 
  static const size_t NY = 60; 
  static const size_t NZ = 60; 

  // Host volume, filled with its own linear index so the contents are recognisable.
  float* h_data = new float[NX * NY * NZ];
  for(unsigned int i = 0; i < NX * NY * NZ; ++i) {
    h_data[i] = static_cast<float>(i);
  }

  // Never allocated; only its address is used below.
  float* d_data = 0;

  // NOTE(review): this wraps the address of the local variable d_data, not a
  // device buffer. dstPtr is passed to cudaMalloc3D below as the output
  // argument, so this initial value looks superfluous.
  cudaPitchedPtr dstPtr = make_cudaPitchedPtr((void**)&d_data, NX * sizeof(float), NX, NY);
  printf("cudaPitchedPtr: %s\n", cudaGetErrorString(cudaGetLastError()));

  // Extent width is given in bytes (NX floats); height and depth are NY and NZ.
  cudaExtent extent = make_cudaExtent(NX * sizeof(float), NY, NZ);;

  // Return value ignored; only cudaGetLastError() is consulted afterwards.
  cudaMalloc3D(&dstPtr, extent);
  printf("cudaMalloc3D: %s\n", cudaGetErrorString(cudaGetLastError()));

  // Zero the device allocation before the copy.
  cudaMemset3D(dstPtr, 0, extent);
  printf("cudaMemset3D: %s\n", cudaGetErrorString(cudaGetLastError()));

  // NOTE(review): (void**)&h_data is the address of the pointer variable, not
  // of the array it points to, so srcPtr.ptr does not refer to the host
  // volume. This is the likely cause of the crash in cudaMemcpy3D; the buffer
  // itself (h_data) should be passed instead.
  cudaPitchedPtr srcPtr = make_cudaPitchedPtr((void**)&h_data, NX * sizeof(float), NX, NY);
  printf("cudaPitchedPtr: %s\n", cudaGetErrorString(cudaGetLastError()));

  // Zero-initialise the descriptor, then set only the fields used here.
  cudaMemcpy3DParms params = {0};
  params.srcPtr = srcPtr;
  params.dstPtr = dstPtr;
  params.extent = extent;
  params.kind = cudaMemcpyHostToDevice;

  cudaMemcpy3D(&params);
  printf("cudaMemcpy3D: %s\n", cudaGetErrorString(cudaGetLastError()));

  // Only the host buffer is released; there is no cudaFree for dstPtr.ptr.
  delete[] h_data;
  return 0;
}
joelmeans
  • 122
  • 1
  • 1
  • 8

1 Answer

2

There are several problems here. In no particular order:

  1. Pass the host pointer for the source memory to make_cudaPitchedPtr by value (h_data), not the address of the pointer variable (&h_data). This is probably the main problem in this code.
  2. The make_cudaPitchedPtr call for the destination device pointer is superfluous. The cudaMalloc3D call will make the pitched pointer based on the extent you provide and whatever requirements the driver and device impose on the allocation.
  3. Your error checking isn't correct. Don't use cudaGetLastError for the API calls which can generate an error. They return an error code directly. Use that status instead (see here for a useful way to check for errors with the runtime API). Note that helper routines like make_cudaExtent and make_cudaPitchedPtr don't modify the runtime API status which cudaGetLastError() uses, so those error check calls are also redundant.

After you fix those things, you might have code which looks like this:

#include <cstdio>
#include <cuda_runtime.h>

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Copies an NX x NY x NZ float volume from host memory into a pitched device
// allocation with cudaMemcpy3D.
// Returns 0 when every CUDA call succeeded, 1 otherwise.
int main() {
  static const size_t NX = 60;
  static const size_t NY = 60;
  static const size_t NZ = 60;

  // Host volume, filled with its own linear index.
  float* h_data = new float[NX * NY * NZ];
  for (size_t i = 0; i < NX * NY * NZ; ++i) {
    h_data[i] = static_cast<float>(i);
  }

  // The host buffer itself is passed (by value), not the address of the
  // pointer variable. The host rows are tightly packed, so the pitch is
  // simply NX * sizeof(float).
  cudaPitchedPtr srcPtr = make_cudaPitchedPtr(h_data, NX * sizeof(float), NX, NY);

  // Zero-initialised so that the cudaFree below is a harmless no-op if the
  // allocation fails. cudaMalloc3D fills in the pointer and pitch.
  cudaPitchedPtr dstPtr = {0};
  // Extent width is in bytes for pitched (non-array) memory.
  cudaExtent extent = make_cudaExtent(NX * sizeof(float), NY, NZ);

  // Use the status each runtime call returns instead of cudaGetLastError().
  bool ok = cudaOk(cudaMalloc3D(&dstPtr, extent), "cudaMalloc3D");
  ok = ok && cudaOk(cudaMemset3D(dstPtr, 0, extent), "cudaMemset3D");

  if (ok) {
    // Unused fields (arrays, positions) must be zero, hence the {0}.
    cudaMemcpy3DParms params = {0};
    params.srcPtr = srcPtr;
    params.dstPtr = dstPtr;
    params.extent = extent;
    params.kind = cudaMemcpyHostToDevice;

    const cudaError_t copyStatus = cudaMemcpy3D(&params);
    printf("cudaMemcpy3D: %s\n", cudaGetErrorString(copyStatus));
    ok = (copyStatus == cudaSuccess);
  }

  // Release the device allocation; cudaFree on a null pointer does nothing.
  cudaFree(dstPtr.ptr);
  delete[] h_data;
  return ok ? 0 : 1;
}

and you might find that it works as expected.

Community
  • 1
  • 1
talonmies
  • 70,661
  • 34
  • 192
  • 269
  • Thanks. That did the trick. I wish the documentation was more clear on things. On the error checking, I typically use what you linked to, but just did the quick printouts for this example. The info about which calls will actually return codes is useful, though. – joelmeans Jan 28 '14 at 19:23