I'd like to experiment with MPS on NVIDIA GPUs, so I'd like to be able to profile two processes running in parallel. With the now-deprecated nvprof, there used to be an option "--profile-all-processes". Is there an equivalent for nsys?
I tried generating multiple reports with MPS off and then importing them into the same timeline, using this code (from this question):
#include <stdio.h>
#include <stdlib.h>
// Exclusive upper bound on the delay (in approximate seconds) accepted from the
// command line; main() only honours a requested value d with 0 < d < MAX_DELAY.
#define MAX_DELAY 30
/*
 * cudaCheckErrors(msg): abort the program if the CUDA runtime reports an error.
 *
 * Reads (and thereby clears) the runtime's last-error state via
 * cudaGetLastError(). If it is anything other than cudaSuccess, prints `msg`,
 * the CUDA error string and the call site to stderr, then exits with status 1.
 *
 * The do { ... } while (0) wrapper makes the macro behave like a single
 * statement. The local is named `err_` rather than `__err` because identifiers
 * containing a double underscore are reserved for the implementation.
 */
#define cudaCheckErrors(msg) \
  do { \
    cudaError_t err_ = cudaGetLastError(); \
    if (err_ != cudaSuccess) { \
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
              msg, cudaGetErrorString(err_), \
              __FILE__, __LINE__); \
      fprintf(stderr, "*** FAILED - ABORTING\n"); \
      exit(1); \
    } \
  } while (0)
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
/*
 * Returns the current wall-clock time in microseconds (as reported by
 * gettimeofday) minus `start`.
 *
 * Call with start == 0 to obtain a timestamp, then pass that timestamp back in
 * to obtain the number of microseconds elapsed since it was taken.
 */
unsigned long long dtime_usec(unsigned long long start){
  struct timeval now;
  gettimeofday(&now, 0);
  // Whole seconds scaled to microseconds, then the sub-second remainder.
  const unsigned long long whole_seconds_usec = now.tv_sec * USECPSEC;
  const unsigned long long timestamp_usec = whole_seconds_usec + now.tv_usec;
  return timestamp_usec - start;
}
// Tick count treated as one second of clock64() time.
// NOTE(review): this assumes a clock of roughly 1 GHz; the real delay scales
// with the device's actual clock rate — confirm for the target GPU.
#define APPRX_CLKS_PER_SEC 1000000000ULL
/*
 * Busy-wait kernel: each launched thread spins until clock64() has advanced by
 * about seconds * APPRX_CLKS_PER_SEC ticks from the value read on entry.
 * It reads and writes no memory; its only purpose is to occupy the GPU for a
 * controllable amount of time.
 *
 * seconds: requested delay, in approximate seconds.
 */
__global__ void delay_kernel(unsigned seconds){
  const unsigned long long start_ticks = clock64();
  const unsigned long long deadline = start_ticks + (seconds*APPRX_CLKS_PER_SEC);
  while (clock64() < deadline) {
    // spin until the deadline tick count is reached
  }
}
/*
 * Entry point: runs delay_kernel three times back to back on device 0 and
 * prints the total wall-clock time taken.
 *
 * argv[1] (optional): per-kernel delay in approximate seconds. Only values
 * strictly between 0 and MAX_DELAY are honoured; anything else (missing,
 * non-numeric, zero, negative, too large) leaves the default of 10 in place.
 *
 * Returns 0 on success; cudaCheckErrors exits with status 1 on any CUDA error.
 */
int main(int argc, char *argv[]){
  cudaSetDevice(0);
  cudaCheckErrors("cudaSetDevice fail");
  unsigned delay_t = 10; // seconds, approximately
  if (argc > 1) {
    // Parse only when an argument exists: the previous code read an
    // uninitialized variable when the program was run without arguments.
    // strtol also has defined behaviour for out-of-range input, unlike atoi.
    long requested = strtol(argv[1], NULL, 10);
    if ((requested > 0) && (requested < MAX_DELAY)) delay_t = (unsigned)requested;
  }
  unsigned long long difft = dtime_usec(0);
  for (int i = 0; i < 3; i++) {
    delay_kernel<<<1,1>>>(delay_t);
    // Launch-configuration errors are only visible through the last-error state.
    cudaCheckErrors("kernel launch fail");
    // Block until this kernel finishes so the three delays run one after another
    // and any execution error is reported for the iteration that caused it.
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
  }
  difft = dtime_usec(difft);
  printf("kernel duration: %fs\n", difft/(float)USECPSEC);
  cudaFree(0);
  return 0;
}
And this script :
# Build the delay test program.
nvcc -o t1034 t1034.cu
# Profile two instances at the same time, each under its own nsys session.
# The first session is backgrounded so that both processes overlap.
nsys profile -o rep1 -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -f true -x true ./t1034 &
nsys profile -o rep2 -w true -t cuda,nvtx,osrt,cudnn,cublas -s none -f true -x true ./t1034
# Wait for the backgrounded session too, so that rep1 is completely written
# before the script returns and the reports are opened.
wait
I then open rep1.qdrep and add rep2.qdrep to it, which produces the following timeline: Nsys Timeline
But I expected something more like this: Reference
Am I doing something wrong? Is this the correct result?
(Side note: I'm running this example inside the nvcr.io/nvidia/tensorrt:20.12-py3 Docker container.)