I am trying to learn CUDA and I am now stuck at running a simple nvprof command.
I am testing a small program, written in both C++ and Fortran, that uses CUDA. The CUDA kernels perform the same simple task in two different ways, with the intent of demonstrating the importance of the branch divergence issue.
When I run
nvprof --metrics branch_efficiency ./codeCpp.x
(i.e., on the C++ code), the command works, but when I try the same thing on the corresponding Fortran code, it doesn't. What is curious is that a plain nvprof ./codeFortran.x on the Fortran executable does show output, but anything with the --metrics flag does not.
Below I paste some info (note that both codes compile fine and do not produce any runtime errors).
I am using Ubuntu 20.
Can anyone help me understand this issue? Thank you.
===================== C++ code
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"
#include "device_launch_parameters.h"
#include "cuda_common.cuh"
// kernel without divergence
__global__ void code_without_divergence(){
// compute unique grid index
int gid = blockIdx.x * blockDim.x + threadIdx.x;
// define some local variables
float a, b;
a = b = 0.0;
// compute the warp index
int warp_id = gid/32;
// conditional statement based on the warp id
if (warp_id % 2 == 0)
{
a = 150.0;
b = 50.0;
}
else
{
a = 200.0;
b = 75.0;
};
}
// kernel with divergence
__global__ void code_with_divergence(){
// compute unique grid index
int gid = blockIdx.x * blockDim.x + threadIdx.x;
// define some local variables
float a, b;
a = b = 0.0;
// conditional statement based on the gid. This will force difference
// code branches within the same warp.
if (gid % 2 == 0)
{
a = 150.0;
b = 50.0;
}
else
{
a = 200.0;
b = 75.0;
};
}
int main (int argc, char** argv){
// set the block size
int size = 1 << 22;
dim3 block_size(128);
dim3 grid_size((size + block_size.x-1)/block_size.x);
code_without_divergence <<< grid_size, block_size>>>();
cudaDeviceSynchronize();
code_with_divergence <<<grid_size, block_size>>>();
cudaDeviceSynchronize();
cudaDeviceReset();
return EXIT_SUCCESS;
};
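(Aside: the launches above are fire-and-forget, so a failed launch would be silent. A minimal status check, using only the standard cudaGetLastError / cudaGetErrorString / cudaDeviceSynchronize runtime calls, could look like the sketch below; the checkLaunch helper name is mine and not part of the program above.)

// minimal launch-status sketch; assumes the includes from the listing above
inline void checkLaunch(const char* label)
{
    // catches launch-configuration problems (e.g., an invalid grid size)
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "%s launch failed: %s\n", label, cudaGetErrorString(err));

    // catches errors that only surface while the kernel executes
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "%s execution failed: %s\n", label, cudaGetErrorString(err));
}

// usage, right after each launch:
//   code_without_divergence<<<grid_size, block_size>>>();
//   checkLaunch("code_without_divergence");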
================ Fortran code
MODULE CUDAUtils
    USE cudafor
    IMPLICIT NONE

CONTAINS

    ! code without divergence routine
    ATTRIBUTES(GLOBAL) SUBROUTINE code_without_divergence()
        IMPLICIT NONE
        !> local variables
        INTEGER :: threadId, warpIdx
        REAL(KIND=8) :: a, b

        ! get the unique threadId
        threadId = (blockIdx%y-1) * gridDim%x * blockDim%x + &
                   (blockIdx%x-1) * blockDim%x + (threadIdx%x-1)

        ! adjust so that the threadId starts from 1
        threadId = threadId + 1

        ! warp index
        warpIdx = threadIdx%x/32

        ! perform the conditional statement
        IF (MOD(warpIdx,2) == 0) THEN
            a = 150.0D0
            b = 50.0D0
        ELSE
            a = 200.0D0
            b = 75.0D0
        END IF
    END SUBROUTINE code_without_divergence

    ! code with divergence routine
    ATTRIBUTES(GLOBAL) SUBROUTINE code_with_divergence()
        IMPLICIT NONE
        !> local variables
        INTEGER :: threadId, warpIdx
        REAL(KIND=8) :: a, b

        ! get the unique threadId
        threadId = (blockIdx%y-1) * gridDim%x * blockDim%x + &
                   (blockIdx%x-1) * blockDim%x + (threadIdx%x-1)

        ! adjust so that the threadId starts from 1
        threadId = threadId + 1

        ! perform the conditional statement
        IF (MOD(threadId,2) == 0) THEN
            a = 150.0D0
            b = 50.0D0
        ELSE
            a = 200.0D0
            b = 75.0D0
        END IF
    END SUBROUTINE code_with_divergence

END MODULE CUDAUtils

PROGRAM main
    USE CUDAUtils
    IMPLICIT NONE

    ! define the variables
    INTEGER :: size1 = 1e20
    INTEGER :: istat
    TYPE(DIM3) :: grid, tBlock

    ! block size is 128 along the 1st dimension only; the grid is 1D
    tBlock = DIM3(128,1,1)
    grid = DIM3((size1 + tBlock%x)/tBlock%x,1,1)

    ! launch the kernels
    CALL code_without_divergence<<<grid,tBlock>>>()
    istat = cudaDeviceSynchronize()

    CALL code_with_divergence<<<grid,tBlock>>>()
    istat = cudaDeviceSynchronize()

    STOP
END PROGRAM main
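(For reference, I build the two executables with commands along these lines; treat them as approximate, since the exact flags come from my makefile:)

nvcc -o codeCpp.x codeCpp.cu
nvfortran -o codeFortran.x codeFortran.cuf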
- Output of
nvprof --metrics branch_efficiency ./codeCpp.x
==6944== NVPROF is profiling process 6944, command: ./codeCpp.x
==6944== Profiling application: ./codeCpp.x
==6944== Profiling result:
==6944== Metric result:
Invocations          Metric Name          Metric Description        Min        Max        Avg
Device "NVIDIA GeForce MX330 (0)"
    Kernel: code_without_divergence(void)
          1    branch_efficiency         Branch Efficiency      100.00%    100.00%    100.00%
    Kernel: code_with_divergence(void)
          1    branch_efficiency         Branch Efficiency       85.71%     85.71%     85.71%
- Output of
nvprof --metrics branch_efficiency ./codeFortran.x
==6983== NVPROF is profiling process 6983, command: ./codeFortran.x
==6983== Profiling application: ./codeFortran.x
==6983== Profiling result:
No events/metrics were profiled.
- Output of
nvprof ./codeFortran.x
==7002== NVPROF is profiling process 7002, command: ./codeFortran.x
==7002== Profiling application: ./codeFortran.x
==7002== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time  Calls       Avg       Min       Max  Name
      API calls:   99.82%  153.45ms      2  76.726ms     516ns  153.45ms  cudaLaunchKernel
                    0.15%  231.24us    101  2.2890us      95ns  172.81us  cuDeviceGetAttribute
                    0.01%  22.522us      1  22.522us  22.522us  22.522us  cuDeviceGetName
                    0.01%  9.1310us      1  9.1310us  9.1310us  9.1310us  cuDeviceGetPCIBusId
                    0.00%  5.4500us      2  2.7250us     876ns  4.5740us  cudaDeviceSynchronize
                    0.00%  1.3480us      3     449ns     195ns     903ns  cuDeviceGetCount
                    0.00%     611ns      1     611ns     611ns     611ns  cuModuleGetLoadingMode
                    0.00%     520ns      2     260ns     117ns     403ns  cuDeviceGet
                    0.00%     245ns      1     245ns     245ns     245ns  cuDeviceTotalMem
                    0.00%     187ns      1     187ns     187ns     187ns  cuDeviceGetUuid
Both the C++ and Fortran executables test the same CUDA concept. They both compile fine, and no runtime errors are shown on the terminal upon execution. When I run the nvprof command on the C++ program, everything works as expected, but when I try it on the corresponding Fortran program there is no output if the --metrics flag is used. I would expect the same behavior I get with the C++ code.
In some other discussions I found that nvprof is no longer supported for GPUs of compute capability 7.5 and above, and that NVIDIA Nsight should be used instead; however, I do not think this is the case here, because I get the expected output with the C++ program on the same GPU.
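For completeness, if the GPU did require the newer tools, I believe the Nsight Compute equivalent of this query would be something like the line below (NVIDIA's nvprof transition guide lists this long metric as the replacement for branch_efficiency; I have not verified it on my machine):

ncu --metrics smsp__sass_average_branch_targets_threads_uniform.pct ./codeFortran.x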