
I am trying to intercept cudaMemcpy calls from the PyTorch library for analysis. I noticed NVIDIA ships a cuHook example in the CUDA toolkit samples, but that example requires modifying the source code of the application itself, which I cannot do in this case. Is there a way to write a hook that intercepts CUDA calls without modifying the application's source code?

Trinayan Baruah
  • How will you get your source code into the application without modifying the application source code? – user253751 Sep 16 '20 at 17:00
  • How you do this will depend on what platform you use. And it would require that your PyTorch build uses the dynamically linked version of the CUDA runtime API library. There is no single answer to this question. For example https://stackoverflow.com/a/6083624/681865 – talonmies Sep 17 '20 at 07:41

1 Answer


A CUDA runtime API call can be hooked (on Linux) using the "LD_PRELOAD trick", provided the application being run is dynamically linked to the CUDA runtime library (libcudart.so); you can verify that with ldd against the executable or the library that actually calls into CUDA.

Here is a simple example on Linux:

$ cat mylib.cpp
#include <stdio.h>
#include <unistd.h>
#include <dlfcn.h>
#include <cuda_runtime.h>

// Hooked cudaMemcpy: same signature as the runtime's version (which is
// declared extern "C" in cuda_runtime.h, so this definition gets C linkage)
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
{
    // look up the real cudaMemcpy further down the library search order
    cudaError_t (*lcudaMemcpy)(void*, const void*, size_t, cudaMemcpyKind) =
        (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind))dlsym(RTLD_NEXT, "cudaMemcpy");
    printf("cudaMemcpy hooked\n");
    return lcudaMemcpy(dst, src, count, kind);
}

cudaError_t cudaMemcpyAsync(void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t str)
{
    cudaError_t (*lcudaMemcpyAsync)(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t) =
        (cudaError_t (*)(void*, const void*, size_t, cudaMemcpyKind, cudaStream_t))dlsym(RTLD_NEXT, "cudaMemcpyAsync");
    printf("cudaMemcpyAsync hooked\n");
    return lcudaMemcpyAsync(dst, src, count, kind, str);
}
$ g++ -I/usr/local/cuda/include -fPIC -shared -o libmylib.so mylib.cpp -ldl -L/usr/local/cuda/lib64 -lcudart
$ cat t1.cu
#include <stdio.h>

int main(){

  int a = 0, *d_a;
  cudaMalloc(&d_a, sizeof(d_a[0]));
  cudaMemcpy(d_a, &a, sizeof(a), cudaMemcpyHostToDevice);
  cudaStream_t str;
  cudaStreamCreate(&str);
  cudaMemcpyAsync(d_a, &a, sizeof(a), cudaMemcpyHostToDevice);      // default stream (stream argument defaults to 0)
  cudaMemcpyAsync(d_a, &a, sizeof(a), cudaMemcpyHostToDevice, str); // explicit stream
  cudaDeviceSynchronize();
}
$ nvcc -o t1 t1.cu -cudart shared
$ LD_LIBRARY_PATH=/usr/local/cuda/lib64 LD_PRELOAD=./libmylib.so cuda-memcheck ./t1
========= CUDA-MEMCHECK
cudaMemcpy hooked
cudaMemcpyAsync hooked
cudaMemcpyAsync hooked
========= ERROR SUMMARY: 0 errors
$

(CentOS 7, CUDA 10.2)
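
Since the stated goal is analysis, the hook body can do more than announce itself. Here is a minimal sketch (my embellishment, not part of the example above): it caches the dlsym lookup in a function-local static so the lookup happens only once, and logs the size and direction of each copy to stderr. The kindNames labels are just an illustrative choice.

#include <stdio.h>
#include <dlfcn.h>
#include <cuda_runtime.h>

// Sketch only: drop-in replacement for the cudaMemcpy hook above,
// with a cached lookup and size/direction logging for analysis.
cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
{
    typedef cudaError_t (*fn_t)(void*, const void*, size_t, cudaMemcpyKind);
    static fn_t real = (fn_t)dlsym(RTLD_NEXT, "cudaMemcpy");   // resolved once
    // labels follow the cudaMemcpyKind enum order (0..4)
    static const char* kindNames[] = {"HtoH", "HtoD", "DtoH", "DtoD", "Default"};
    fprintf(stderr, "cudaMemcpy %zu bytes (%s)\n", count, kindNames[kind]);
    return real(dst, src, count, kind);
}

Note that nvcc links the CUDA runtime statically by default, which would bypass any LD_PRELOAD hook; that is why the test above is built with -cudart shared.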

A simple test with PyTorch seems to indicate that it works:

$ docker run --gpus all -it nvcr.io/nvidia/pytorch:20.08-py3
...
Status: Downloaded newer image for nvcr.io/nvidia/pytorch:20.08-py3

=============
== PyTorch ==
=============

NVIDIA Release 20.08 (build 15516749)
PyTorch Version 1.7.0a0+8deb4fe

...
root@946934df529b:/workspace# cat mylib.cpp
... (same mylib.cpp as above)
root@946934df529b:/workspace# g++ -I/usr/local/cuda/include -fPIC -shared -o libmylib.so mylib.cpp -ldl -L/usr/local/cuda/lib64 -lcudart
root@946934df529b:/workspace# cat tt.py
import torch
device = torch.cuda.current_device()
x = torch.randn(1024, 1024).to(device)   # host-to-device copy (shows up below as cudaMemcpyAsync)
y = torch.randn(1024, 1024).to(device)   # host-to-device copy
z = torch.matmul(x, y)
root@946934df529b:/workspace# LD_LIBRARY_PATH=/usr/local/cuda/lib64 LD_PRELOAD=./libmylib.so python tt.py
cudaMemcpyAsync hooked
cudaMemcpyAsync hooked
root@946934df529b:/workspace#

(using the NVIDIA NGC PyTorch container)
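
The same dlsym pattern should extend to any other runtime API entry point that is dynamically resolved; as a hedged sketch (not something the test above exercises), a cudaMalloc hook could look like this:

#include <stdio.h>
#include <dlfcn.h>
#include <cuda_runtime.h>

// Sketch only: intercept device allocations with the same technique.
cudaError_t cudaMalloc(void** devPtr, size_t size)
{
    typedef cudaError_t (*fn_t)(void**, size_t);
    static fn_t real = (fn_t)dlsym(RTLD_NEXT, "cudaMalloc");
    cudaError_t err = real(devPtr, size);
    if (err == cudaSuccess)   // *devPtr is only meaningful on success
        fprintf(stderr, "cudaMalloc hooked: %zu bytes at %p\n", size, *devPtr);
    return err;
}

Keep in mind that PyTorch uses a caching allocator, so you will typically see far fewer cudaMalloc calls than tensor allocations.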

Robert Crovella