I'm familiarizing myself with a new cluster equipped with Pascal P100 GPUs + NVLink. I wrote a ping-pong program to test GPU<->GPU and GPU<->CPU bandwidths and peer-to-peer access. (I'm aware the CUDA samples contain such a program, but I wanted to write it myself for better understanding.) The NVLink bandwidths appear reasonable (~35 GB/s bidirectional, against a theoretical maximum of 40 GB/s). However, while debugging the ping-pong I discovered some odd behavior.
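(For reference, the bandwidth numbers come from timing large transfers with CUDA events, roughly as in the sketch below; the buffer size and names here are illustrative, not the actual ping-pong code.)

#include <stdio.h>
#include <cuda_runtime.h>

// Minimal sketch of the timing approach: time one large H2D transfer with
// CUDA events and report GB/s. Buffer size is illustrative.
int main()
{
    const size_t bytes = 1 << 28; // 256 MiB
    float *hostBuf, *devBuf;
    cudaMallocHost( (void**)&hostBuf, bytes ); // page-locked so the copy is truly async
    cudaMalloc( (void**)&devBuf, bytes );
    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    cudaEventRecord( start );
    cudaMemcpyAsync( devBuf, hostBuf, bytes, cudaMemcpyHostToDevice );
    cudaEventRecord( stop );
    cudaEventSynchronize( stop );
    float ms = 0.0f;
    cudaEventElapsedTime( &ms, start, stop );
    printf( "H2D bandwidth: %.2f GB/s\n", (bytes / 1.0e9) / (ms / 1.0e3) );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    cudaFree( devBuf );
    cudaFreeHost( hostBuf );
    return 0;
}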
First, cudaMemcpyAsync succeeds no matter which cudaMemcpyKind I specify. For example, if cudaMemcpyAsync is copying memory from host to device, it succeeds even if I pass cudaMemcpyDeviceToHost as the kind.
Secondly, when the host memory is not page-locked, cudaMemcpyAsync does the following:
- Copying memory from the host to the device appears to succeed (no segfaults or CUDA runtime errors, and the data transfers properly).
- Copying memory from the device to the host fails silently: no segfault occurs, and cudaDeviceSynchronize after the memcpy returns cudaSuccess, but inspecting the data reveals that it did not actually transfer from the GPU to the host.
Is this behavior expected? I have included a minimal working example that demonstrates it on my system (the sample is not the ping-pong app; all it does is exercise cudaMemcpyAsync with various parameters).
The P100s have UVA enabled, so it is plausible to me that cudaMemcpyAsync simply infers the locations of the src and dst pointers and ignores the cudaMemcpyKind argument. However, I'm not sure why cudaMemcpyAsync fails to return an error for non-page-locked host memory. I was under the impression that was a strict no-no.
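To poke at that theory, one can ask the runtime directly where a pointer lives via cudaPointerGetAttributes, which is presumably the same machinery cudaMemcpyAsync would use under UVA; note that the documented cudaMemcpyDefault kind explicitly requests this inference. Below is a small sketch, assuming a CUDA 10+ toolkit where cudaPointerAttributes exposes a .type field (older toolkits call it .memoryType); the helper name is mine, not part of the sample:

#include <stdio.h>
#include <cuda_runtime.h>

// Sketch: report where a pointer lives according to the runtime.
// Assumes CUDA 10+ (cudaPointerAttributes::type).
void printPointerLocation( const void* p, const char* label )
{
    cudaPointerAttributes attr;
    cudaError_t err = cudaPointerGetAttributes( &attr, p );
    if( err != cudaSuccess )
    {
        // On older toolkits, plain malloc'd (unregistered) memory returns an error.
        printf( "%s: unregistered host memory (%s)\n", label, cudaGetErrorString(err) );
        cudaGetLastError(); // clear the sticky error
        return;
    }
    printf( "%s: type = %d (0=unregistered, 1=host, 2=device, 3=managed)\n"
          , label
          , (int)attr.type );
}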
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__ void checkDataDevice( int* current, int* next, int expected_current_val, int n )
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for( int i = tid; i < n; i += blockDim.x*gridDim.x )
    {
        if( current[i] != expected_current_val )
            printf( "Error on device: expected = %d, current[%d] = %d\n"
                  , expected_current_val
                  , i
                  , current[i] );
        // Increment the data so the next copy is properly tested
        next[i] = current[i] + 1;
    }
}
void checkDataHost( int* current, int* next, int expected_current_val, int n )
{
    for( int i = 0; i < n; i++ )
    {
        if( current[i] != expected_current_val )
            printf( "Error on host: expected = %d, current[%d] = %d\n"
                  , expected_current_val
                  , i
                  , current[i] );
        // Increment the data so the next copy is properly tested
        next[i] = current[i] + 1;
    }
}
int main( int argc, char** argv )
{
    bool pagelocked = true;
    // Invoking the executable with any additional argument(s) turns off page-locked memory, i.e.,
    // Run with page-locked memory:       ./a.out
    // Run with ordinary malloc'd memory: ./a.out jkfdlsja
    if( argc > 1 )
        pagelocked = false;
    int copybytes = 100000000; // 1e8; OK to use int instead of size_t at this size.
    cudaStream_t stream;
    gpuErrchk( cudaStreamCreate( &stream ) );
    int* srcHost;
    int* dstHost;
    int* srcDevice;
    int* dstDevice;
    gpuErrchk( cudaMalloc( (void**)&srcDevice, copybytes ) );
    gpuErrchk( cudaMalloc( (void**)&dstDevice, copybytes ) );
    if( pagelocked )
    {
        printf( "Using page locked memory\n" );
        gpuErrchk( cudaMallocHost( (void**)&srcHost, copybytes ) );
        gpuErrchk( cudaMallocHost( (void**)&dstHost, copybytes ) );
    }
    else
    {
        printf( "Using non page locked memory\n" );
        srcHost = (int*)malloc( copybytes );
        dstHost = (int*)malloc( copybytes );
    }
    int nelems = copybytes / (int)sizeof(int);
    for( int i = 0; i < nelems; i++ )
        srcHost[i] = 1;
    cudaMemcpyKind kinds[4];
    kinds[0] = cudaMemcpyHostToDevice;
    kinds[1] = cudaMemcpyDeviceToHost;
    kinds[2] = cudaMemcpyHostToHost;
    kinds[3] = cudaMemcpyDeviceToDevice;
    // Test cudaMemcpyAsync in both directions,
    // iterating through all cudaMemcpyKinds to verify
    // that they don't matter.
    int expected_current_val = 1;
    for( int kind = 0; kind < 4; kind++ )
    {
        // Host to device copy. The return value is checked to confirm that the
        // call itself reports cudaSuccess regardless of the kind passed.
        gpuErrchk( cudaMemcpyAsync( dstDevice
                                  , srcHost
                                  , copybytes
                                  , kinds[kind]
                                  , stream ) );
        gpuErrchk( cudaDeviceSynchronize() );
        // The kernel runs on the legacy default stream, which synchronizes with
        // the blocking stream created above, so no extra sync is needed before
        // the device-to-host copy below.
        checkDataDevice<<<56*8,256>>>( dstDevice
                                     , srcDevice
                                     , expected_current_val
                                     , nelems );
        gpuErrchk( cudaPeekAtLastError() );
        expected_current_val++;
        // Device to host copy
        gpuErrchk( cudaMemcpyAsync( dstHost
                                  , srcDevice
                                  , copybytes
                                  , kinds[kind]
                                  , stream ) );
        gpuErrchk( cudaDeviceSynchronize() );
        checkDataHost( dstHost
                     , srcHost
                     , expected_current_val
                     , nelems );
        expected_current_val++;
    }
    gpuErrchk( cudaStreamDestroy( stream ) );
    gpuErrchk( cudaFree( srcDevice ) );
    gpuErrchk( cudaFree( dstDevice ) );
    if( pagelocked )
    {
        gpuErrchk( cudaFreeHost( srcHost ) );
        gpuErrchk( cudaFreeHost( dstHost ) );
    }
    else
    {
        free( srcHost );
        free( dstHost );
    }
    return 0;
}
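For completeness, I build and run it like this (sm_60 for the P100; the file name is arbitrary):

nvcc -arch=sm_60 memcpy_test.cu -o memcpy_test
./memcpy_test          # page-locked host memory
./memcpy_test x        # pageable host memory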