I'm familiarizing myself with a new cluster equipped with Pascal P100 GPUs + NVLink. I wrote a ping-pong program to test GPU<->GPU and GPU<->CPU bandwidths and peer-to-peer access. (I'm aware the CUDA samples contain such a program, but I wanted to write it myself for better understanding.) The NVLink bandwidths appear reasonable (~35 GB/s bidirectional, against a theoretical maximum of 40 GB/s). However, while debugging the ping-pong I discovered some odd behavior.
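(For reference, the bandwidth numbers come from timing large transfers with CUDA events, roughly as in the sketch below; the buffer size and names here are illustrative, not the actual ping-pong code.)

#include <stdio.h>
#include <cuda_runtime.h>

// Minimal sketch of the timing approach: time one large H2D transfer with
// CUDA events and report GB/s. Buffer size is illustrative.
int main()
{
    const size_t bytes = 1 << 28; // 256 MiB
    float *hostBuf, *devBuf;
    cudaMallocHost( (void**)&hostBuf, bytes ); // page-locked so the copy is truly async
    cudaMalloc( (void**)&devBuf, bytes );
    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    cudaEventRecord( start );
    cudaMemcpyAsync( devBuf, hostBuf, bytes, cudaMemcpyHostToDevice );
    cudaEventRecord( stop );
    cudaEventSynchronize( stop );
    float ms = 0.0f;
    cudaEventElapsedTime( &ms, start, stop );
    printf( "H2D bandwidth: %.2f GB/s\n", (bytes / 1.0e9) / (ms / 1.0e3) );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    cudaFree( devBuf );
    cudaFreeHost( hostBuf );
    return 0;
}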
First, cudaMemcpyAsync succeeds no matter which cudaMemcpyKind I specify. For example, if cudaMemcpyAsync is copying memory from host to device, it succeeds even if I pass cudaMemcpyDeviceToHost as the kind.
Secondly, when the host memory is not page-locked, cudaMemcpyAsync does the following:
- Copying memory from the host to the device appears to succeed (no segfaults or CUDA runtime errors, and the data transfers properly).
- Copying memory from the device to the host fails silently: no segfault occurs, and cudaDeviceSynchronize after the memcpy returns cudaSuccess, but inspecting the data reveals that it did not actually transfer from the GPU to the host.
Is this behavior expected? I have included a minimal working example that demonstrates it on my system (the sample is not the ping-pong app; all it does is exercise cudaMemcpyAsync with various parameters).
The P100s have UVA enabled, so it is plausible to me that cudaMemcpyAsync simply infers the locations of the src and dst pointers and ignores the cudaMemcpyKind argument. However, I'm not sure why cudaMemcpyAsync fails to return an error for non-page-locked host memory. I was under the impression that was a strict no-no.
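To poke at that theory, one can ask the runtime directly where a pointer lives via cudaPointerGetAttributes, which is presumably the same machinery cudaMemcpyAsync would use under UVA; note that the documented cudaMemcpyDefault kind explicitly requests this inference. Below is a small sketch, assuming a CUDA 10+ toolkit where cudaPointerAttributes exposes a .type field (older toolkits call it .memoryType); the helper name is mine, not part of the sample:

#include <stdio.h>
#include <cuda_runtime.h>

// Sketch: report where a pointer lives according to the runtime.
// Assumes CUDA 10+ (cudaPointerAttributes::type).
void printPointerLocation( const void* p, const char* label )
{
    cudaPointerAttributes attr;
    cudaError_t err = cudaPointerGetAttributes( &attr, p );
    if( err != cudaSuccess )
    {
        // On older toolkits, plain malloc'd (unregistered) memory returns an error.
        printf( "%s: unregistered host memory (%s)\n", label, cudaGetErrorString(err) );
        cudaGetLastError(); // clear the sticky error
        return;
    }
    printf( "%s: type = %d (0=unregistered, 1=host, 2=device, 3=managed)\n"
          , label
          , (int)attr.type );
}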
#include <stdio.h>
#include <cuda_runtime.h>
#include <stdlib.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
__global__ void checkDataDevice( int* current, int* next, int expected_current_val, int n )
{
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    for( int i = tid; i < n; i += blockDim.x*gridDim.x )
    {
        if( current[i] != expected_current_val )
            printf( "Error on device: expected = %d, current[%d] = %d\n"
                  , expected_current_val
                  , i
                  , current[i] );
        // Increment the data so the next copy is properly tested
        next[i] = current[i] + 1;
    }
}
void checkDataHost( int* current, int* next, int expected_current_val, int n )
{
    for( int i = 0; i < n; i++ )
    {
        if( current[i] != expected_current_val )
            printf( "Error on host: expected = %d, current[%d] = %d\n"
                  , expected_current_val
                  , i
                  , current[i] );
        // Increment the data so the next copy is properly tested
        next[i] = current[i] + 1;
    }
}
int main( int argc, char** argv )
{
    bool pagelocked = true;
    // Invoking the executable with any additional argument(s) turns off page-locked memory, i.e.,
    // Run with page-locked memory:       ./a.out
    // Run with ordinary malloc'd memory: ./a.out jkfdlsja
    if( argc > 1 )
        pagelocked = false;
    int copybytes = 100000000; // 1e8; OK to use int instead of size_t at this size.
    cudaStream_t stream;
    gpuErrchk( cudaStreamCreate( &stream ) );
    int* srcHost;
    int* dstHost;
    int* srcDevice;
    int* dstDevice;
    gpuErrchk( cudaMalloc( (void**)&srcDevice, copybytes ) );
    gpuErrchk( cudaMalloc( (void**)&dstDevice, copybytes ) );
    if( pagelocked )
    {
        printf( "Using page locked memory\n" );
        gpuErrchk( cudaMallocHost( (void**)&srcHost, copybytes ) );
        gpuErrchk( cudaMallocHost( (void**)&dstHost, copybytes ) );
    }
    else
    {
        printf( "Using non page locked memory\n" );
        srcHost = (int*)malloc( copybytes );
        dstHost = (int*)malloc( copybytes );
    }
    int nelems = copybytes / (int)sizeof(int);
    for( int i = 0; i < nelems; i++ )
        srcHost[i] = 1;
    cudaMemcpyKind kinds[4];
    kinds[0] = cudaMemcpyHostToDevice;
    kinds[1] = cudaMemcpyDeviceToHost;
    kinds[2] = cudaMemcpyHostToHost;
    kinds[3] = cudaMemcpyDeviceToDevice;
    // Test cudaMemcpyAsync in both directions,
    // iterating through all cudaMemcpyKinds to verify
    // that they don't matter.
    int expected_current_val = 1;
    for( int kind = 0; kind < 4; kind++ )
    {
        // Host to device copy. The return value is checked to confirm that the
        // call itself reports cudaSuccess regardless of the kind passed.
        gpuErrchk( cudaMemcpyAsync( dstDevice
                                  , srcHost
                                  , copybytes
                                  , kinds[kind]
                                  , stream ) );
        gpuErrchk( cudaDeviceSynchronize() );
        // The kernel runs on the legacy default stream, which synchronizes with
        // the blocking stream created above, so no extra sync is needed before
        // the device-to-host copy below.
        checkDataDevice<<<56*8,256>>>( dstDevice
                                     , srcDevice
                                     , expected_current_val
                                     , nelems );
        gpuErrchk( cudaPeekAtLastError() );
        expected_current_val++;
        // Device to host copy
        gpuErrchk( cudaMemcpyAsync( dstHost
                                  , srcDevice
                                  , copybytes
                                  , kinds[kind]
                                  , stream ) );
        gpuErrchk( cudaDeviceSynchronize() );
        checkDataHost( dstHost
                     , srcHost
                     , expected_current_val
                     , nelems );
        expected_current_val++;
    }
    gpuErrchk( cudaStreamDestroy( stream ) );
    gpuErrchk( cudaFree( srcDevice ) );
    gpuErrchk( cudaFree( dstDevice ) );
    if( pagelocked )
    {
        gpuErrchk( cudaFreeHost( srcHost ) );
        gpuErrchk( cudaFreeHost( dstHost ) );
    }
    else
    {
        free( srcHost );
        free( dstHost );
    }
    return 0;
}
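For completeness, I build and run it like this (sm_60 for the P100; the file name is arbitrary):

nvcc -arch=sm_60 memcpy_test.cu -o memcpy_test
./memcpy_test          # page-locked host memory
./memcpy_test x        # pageable host memory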