Error compiling CUDA dynamic parallelism code with Driver API

Question

Am trying to implement the NestedHelloWorld as an example of dynamic parallelism. The environ is VS2019, must implement as Driver API interface, in C. The codes are

CUdevice* gpu_initialize(CUdevice *dev_ptr, int int_of_dev);        //signature of the init function

// Host code
int main()
{
printf("Nested Parallelism : Hello World (Driver API)\n");

CUdevice device;
if (gpu_initialize(&device,0) == NULL) {                //Initialize the device[0] and populate the device pointer contents
    printf("Error Initializing GPU... exiting");
    exit(-1);
}

CUresult result;
char* error_string;

//Create a context
CUcontext  context;
unsigned int flags = CU_CTX_SCHED_YIELD;            //set to mode where the GPU yields when awaiting CPU : increases latency
if ((result = cuCtxCreate(&context,flags,device)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error creating context : %s\n", error_string);
        return -1;
    }
    else {
        printf("Unknown error creating context\n");
        return -1;
    }
}

//Load the module by specifying the filename
CUmodule module;
char filename[] = "C:\\Users\\gautam\\OneDrive\\Projects\\VS 2019\\repos\\learn_cuda_nesting\\dynamic_parallelism\\x64\\Debug\\nestedHelloWorld.cu.obj";        //Use for VS 2019 compilation

if ((result = cuModuleLoad(&module,filename)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error loading Module using filename %s: %s \n",filename,error_string);
        return -1;
    }
    else {
        printf("Unknown error loading Module from filename %s\n",filename);
        return -1;
    }
}
else printf("Successfully loaded module %s\n", filename);

//Load the function from the module
CUfunction function;
char        function_name[120];
strcpy_s(function_name,120,"nestedHelloWorld");
if ((result = cuModuleGetFunction(&function,module,function_name)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error loading function %s: %s\n",function_name, error_string);
        return -1;
    }
    else {
        printf("Unknown error loading function %s\n",function_name);
        return -1;
    }
}
else printf("Successfully loaded function %s\n", function_name);

//Set up kernel grid parameters

int size = 8;
int blocksize = 8;   // initial block size
int igrid = 1;
int threads_per_block_x = 8;
int threads_per_block_y = 1;
int blocks_per_grid_x = (size + threads_per_block_x - 1) / threads_per_block_x;
int blocks_per_grid_y = 1;

//Launch the first function in the kernel
//Case 1 : Invoke kernel with 8 x 1 grid size
void* args[] = {(void *)&size, (void *)&igrid};

if ((result = cuLaunchKernel(function, blocks_per_grid_x, blocks_per_grid_y, 1, threads_per_block_x, threads_per_block_y, 1, 0, 0, args, 0)) != CUDA_SUCCESS) {
    if (cuGetErrorName(result, &error_string) != CUDA_ERROR_INVALID_VALUE) {
        printf("Error launching kernel %s: %s\n",function_name, error_string);
        result = cuCtxDestroy(context);
        return -1;
    }
    else {
        printf("Unknown error launching kernel %s\n", function_name);
        result = cuCtxDestroy(context);
        return -1;
    }
}
else printf("CUDA kernel launch with (%d,%d) blocks per grid, each with (%d,%d) threads per block\n", blocks_per_grid_x, blocks_per_grid_y, threads_per_block_x, threads_per_block_y);

result = cuCtxSynchronize();        //wait for kernel run to finish
result = cuCtxDestroy(context);
exit(0);
}

and the corresponding .cu file code is :

extern "C" __global__ void nestedHelloWorld(int const iSize, int iDepth)
{
int tid = threadIdx.x;
printf("Recursion=%d: Hello World from thread %d block %d\n", iDepth, tid,blockIdx.x);

// condition to stop recursive execution
if (iSize == 1) return;

// reduce block size to half
int nthreads = iSize >> 1;

// thread 0 launches child grid recursively
if(tid == 0 && nthreads > 0)
{
    //nestedHelloWorld(nthreads, ++iDepth);
    nestedHelloWorld<<<1, nthreads>>>(nthreads, ++iDepth);
    printf("-------> nested execution depth: %d\n", iDepth);
}
}

The settings are as follows :

and finally, the gpu parameters and the error saying invalid PTX.

While this appears to be the "elementary code", the real intent is to identify what compilation parameter error(s) are being done. The compilation goes through without any errors.

All help is appreciated deeply...

Edit : Have added the successful compilation screenshot, and the screenshot of external command used in a separate attempt to compile the file and then link. The error has not changed.

To respond to first part - the compilation of .cu.obj from the source is part of VS 2019 "Build" or "Rebuild" instruction.. Second part - I have tried a standalone compilation using nvcc with -rdc=true and that has the same error. Am editing the question to add these two facets... Tks — quasar66, Sep 10 '21 at 14:41

Robert Crovella · Answer 1 · 2021-09-11T18:06:26.573

We can build a complete example here by starting with the ptxjit CUDA sample code and then follow the additional instructions here. That's essentially everything you need.

Here's a full worked example on linux, including Makefile:

$ cat kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat ptxjit.cpp

/*
 */

// System includes
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <iostream>

// CUDA driver & runtime
#include <cuda.h>
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#define CUDA_DRIVER_API
#include <helper_cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>  // helper for shared that are common to CUDA Samples

#define PTX_FILE "kernel.ptx"

const char *sSDKname = "CDP Recursion Test (Driver API)";

bool inline findModulePath(const char *module_file, std::string &module_path,
                           char **argv, std::string &ptx_source) {
  char *actual_path = sdkFindFilePath(module_file, argv[0]);

  if (actual_path) {
    module_path = actual_path;
  } else {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  }

  if (module_path.empty()) {
    printf("> findModulePath file not found: <%s> \n", module_file);
    return false;
  } else {
    printf("> findModulePath <%s>\n", module_path.c_str());

    if (module_path.rfind(".ptx") != std::string::npos) {
      FILE *fp = fopen(module_path.c_str(), "rb");
      fseek(fp, 0, SEEK_END);
      int file_size = ftell(fp);
      char *buf = new char[file_size + 1];
      fseek(fp, 0, SEEK_SET);
      fread(buf, sizeof(char), file_size, fp);
      fclose(fp);
      buf[file_size] = '\0';
      ptx_source = buf;
      delete[] buf;
    }

    return true;
  }
}

void ptxJIT(int argc, char **argv, CUmodule *phModule, CUfunction *phKernel,
            CUlinkState *lState) {
  const int options_num = 5;
  CUjit_option options[options_num];
  void *optionVals[options_num];
  float walltime;
  char error_log[8192], info_log[8192];
  unsigned int logSize = 8192;
  void *cuOut;
  size_t outSize;
  int myErr = 0;
  std::string module_path, ptx_source;

  // Setup linker options
  // Return walltime from JIT compilation
  options[0] = CU_JIT_WALL_TIME;
  optionVals[0] = (void *)&walltime;
  // Pass a buffer for info messages
  options[1] = CU_JIT_INFO_LOG_BUFFER;
  optionVals[1] = (void *)info_log;
  // Pass the size of the info buffer
  options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
  optionVals[2] = (void *)(long)logSize;
  // Pass a buffer for error message
  options[3] = CU_JIT_ERROR_LOG_BUFFER;
  optionVals[3] = (void *)error_log;
  // Pass the size of the error buffer
  options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
  optionVals[4] = (void *)(long)logSize;

  // Create a pending linker invocation
  checkCudaErrors(cuLinkCreate(options_num, options, optionVals, lState));

  // first search for the module path before we load the results
  if (!findModulePath(PTX_FILE, module_path, argv, ptx_source)) {
    printf("> findModulePath could not find <kernel> ptx\n");
    exit(EXIT_FAILURE);
  } else {
    printf("> initCUDA loading module: <%s>\n", module_path.c_str());
  }

  // Load the PTX from the ptx file
  printf("Loading ptxjit_kernel[] program\n");
  myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void *)ptx_source.c_str(),
                        strlen(ptx_source.c_str()) + 1, 0, 0, 0, 0);

  if (myErr != CUDA_SUCCESS) {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
    // above.
    fprintf(stderr, "PTX Linker Error:\n%s\n", error_log);
  }
  myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL);
  if (myErr != CUDA_SUCCESS) {
    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option
    // above.
    fprintf(stderr, "Library Linker Error:\n%s\n", error_log);
  }
  // Complete the linker step
  checkCudaErrors(cuLinkComplete(*lState, &cuOut, &outSize));

  // Linker walltime and info_log were requested in options above.
  printf("CUDA Link Completed in %fms. Linker Output:\n%s\n", walltime,
         info_log);

  // Load resulting cuBin into module
  checkCudaErrors(cuModuleLoadData(phModule, cuOut));

  // Locate the kernel entry poin
  checkCudaErrors(cuModuleGetFunction(phKernel, *phModule, "k"));

  // Destroy the linker invocation
  checkCudaErrors(cuLinkDestroy(*lState));
}

// Variables
CUcontext cuContext;

int main(int argc, char **argv) {
  const unsigned int nThreads = 1;
  const unsigned int nBlocks = 1;

  CUmodule hModule = 0;
  CUfunction hKernel = 0;
  CUlinkState lState;

  int cuda_device = 0;

  printf("[%s] - Starting...\n", sSDKname);
  // Initialize
  checkCudaErrors(cuInit(0));

  CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
  int driverVersion;
  cudaDriverGetVersion(&driverVersion);
  if (driverVersion < CUDART_VERSION) {
    printf("driverVersion = %d < CUDART_VERSION = %d \n"
      "Enhanced compatibility is not supported for this sample.. waving execution\n", driverVersion, CUDART_VERSION);
    exit(EXIT_WAIVED);
  }
  // Create context
  checkCudaErrors(cuCtxCreate(&cuContext, 0, dev));


  // JIT Compile the Kernel from PTX and get the Handles (Driver API)
  ptxJIT(argc, argv, &hModule, &hKernel, &lState);

  // Set the kernel parameters (Driver API)
  dim3 block(nThreads, 1, 1);
  dim3 grid(nBlocks, 1, 1);
  int my_N = 4;
  void *args[1] = {&my_N};

  // Launch the kernel (Driver API_)
  checkCudaErrors(cuLaunchKernel(hKernel, grid.x, grid.y, grid.z, block.x,
                                 block.y, block.z, 0, NULL, args, NULL));
  std::cout << "CUDA kernel launched" << std::endl;

  cuCtxSynchronize();

  if (hModule) {
    checkCudaErrors(cuModuleUnload(hModule));
    hModule = 0;
  }

  return EXIT_SUCCESS;
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

PTX_FILE := kernel.ptx

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 70

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

LIBRARIES += -lcudart_static

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: ptxjit $(PTX_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(PTX_FILE): kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -o $@ -ptx $<

ptxjit.o:ptxjit.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

ptxjit: ptxjit.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./ptxjit

clean:
        rm -f ptxjit ptxjit.o  $(PTX_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f ptxjit ptxjit.o  kernel.ptx
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit.o -c ptxjit.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -o ptxjit ptxjit.o  -L/usr/local/cuda/lib64/stubs -lcuda -lcudart_static
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_70,code=compute_70 -rdc=true -o kernel.ptx -ptx kernel.cu
$ cuda-memcheck ./ptxjit
========= CUDA-MEMCHECK
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath <./kernel.ptx>
> initCUDA loading module: <./kernel.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in 0.000000ms. Linker Output:

CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

On windows, you'd follow the same path, using the above files. Start with the ptxjit project. For simplicity you might want to rename the kernel so that it exactly matches the name of the kernel file used in the ptxjit project.

Here's the detailed steps I followed, using VS2019:

Open the ptxjit solution, on my machine it was here: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit
Take the ptxjit.cpp code from the above linux version, and use it to replace the contents of ptxjit.cpp in that solution/project.
Change the define statement back to: #define PTX_FILE "ptxjit_kernel64.ptx"
Change the location of the device runtime library to match your machine. Specifically this line: myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "/usr/local/cuda/lib64/libcudadevrt.a", 0, NULL, NULL); needs to be changed to something like myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.1\\lib\\x64\\cudadevrt.lib", 0, NULL, NULL);
In the ptxjit_kernel.cu file in that project, replace the contents of that file with the kernel.cu file contents from the above linux version.
In the solution explorer window, right click on the ptxjit_kernel.cu file and select "Properties". In the "Configuration Properties" pane on the left, expand the CUDA C/C++ section, and select "Common". In the pane on the right, change the "Generate Relocatable Device Code" option from No to Yes. Click "OK".
In the same solution explorer window, right click on the ptxjit project, and select properties. Go into configuration properties...CUDA Linker...General, and change "Perform Device Link" from Yes to No. Click "OK".
Select Build...Rebuild Solution

When I do that, I get build console output like this:

1>------ Rebuild All started: Project: ptxjit, Configuration: Debug x64 ------
1>Compiling CUDA source file ptxjit_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_35,code=\"compute_35,compute_35\" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -ptx -cudart static -Xcompiler "/wd 4819" -o data/ptxjit_kernel64.ptx "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit_kernel.cu"
1>CUDACOMPILE : nvcc warning : The 'compute_35', 'compute_37', 'compute_50', 'sm_35', 'sm_37' and 'sm_50' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
1>ptxjit_kernel.cu
1>Done building project "ptxjit_vs2019.vcxproj".
1>ptxjit.cpp
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(318,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\ptxjit.cpp(324,41): warning C4312: 'type cast': conversion from 'long' to 'void *' of greater size
1>LINK : ..\..\bin\win64\Debug\\ptxjit.exe not found or not built by the last incremental link; performing full link
1>   Creating library ../../bin/win64/Debug/ptxjit.lib and object ../../bin/win64/Debug/ptxjit.exp
1>ptxjit_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe
1>Done building project "ptxjit_vs2019.vcxproj".
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

At that point we can go to the indicated location of the exe:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\6_Advanced\ptxjit\../../bin/win64/Debug/ptxjit.exe

and run it in a command console. When I do that I see output like this:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>ptxjit.exe
[CDP Recursion Test (Driver API)] - Starting...
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <ptxjit_kernel64.ptx> in ./
...
sdkFindFilePath <ptxjit_kernel64.ptx> in ../../../6_Advanced/ptxjit/data/
> findModulePath <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
> initCUDA loading module: <../../../6_Advanced/ptxjit/data/ptxjit_kernel64.ptx>
Loading ptxjit_kernel[] program
CUDA Link Completed in -107374176.000000ms. Linker Output:

CUDA kernel launched
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

Notes:

The C4312 warnings in compilation above are in the original project and can be removed by switching from long to long long on the associated lines. This is not an actual problem.
The extended sequence of sdkFindFilePath messages printed at runtime can be shortened by copying the ptx file from its location to the location of the exe file. The final sdkFindFilePath output will tell you where it found the ptx file.

The solution works smoothly. Exploring further - is there a way to avoid this "jit linking during runtime" process while still being with Driver API interface ? This linking slows down runtime of those host threads that need the module considerably. — quasar66, Sep 11 '21 at 10:00

Robert Crovella · Accepted Answer · 2021-09-12T00:22:26.243

Responding to a question posted under my other answer:

is there a way to avoid this "jit linking during runtime" process while still being with Driver API interface

Yes. (I'm providing a separate answer because I ran into the character limit on my previous answer).

In this case, we want to create a fatbin object rather than ptx, during the compilation of the kernel code itself. This fatbin needs to be compiled with -rdc=true as you would expect for the dynamic parallelism, and also needs to be device-linked, together with the CUDA device runtime library.

The host side mechanics in this case are simpler, since we don't need any of the linking steps. The CUDA sample code that seems to be relatively close to this flow is vectorAddDrv so I will start with that code/sample project in order to demonstrate this.

Here is the linux version:

$ cat vectorAdd_kernel.cu
#include <cstdio>
extern "C" __global__ void k(int N)
{
    printf("kernel level %d\n", N);
    if ((N > 1) && (threadIdx.x == 0))  k<<<1,1>>>(N-1);
}
$ cat vectorAddDrv.cpp
// Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <cuda.h>

// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>

// includes, CUDA
#include <builtin_types.h>

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction vecAdd_kernel;

// Functions
bool findModulePath(const char *, string &, char **, string &);

//define input fatbin file
#ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Linked CDP demo (Driver API)\n");
    int N = 4, devID = 0;

    // Initialize
    checkCudaErrors(cuInit(0));

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
    // Create context
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string module_path;

    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
    {
        exit(EXIT_FAILURE);
    }
    else
    {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size())
    {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "k"));


        // Grid/Block configuration
        int threadsPerBlock = 1;
        int blocksPerGrid   = 1;

        void *args[] = { &N };

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel,  blocksPerGrid, 1, 1,
                               threadsPerBlock, 1, 1,
                               0,
                               NULL, args, NULL));

    checkCudaErrors(cuCtxSynchronize());

    exit(EXIT_SUCCESS);
}
$ cat Makefile
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda

##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
    $(info WARNING - x86_64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
    TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
    $(info WARNING - ARMv7 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=armv7l instead)
    TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
    $(info WARNING - aarch64 variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
    TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
    $(info WARNING - ppc64le variable has been deprecated)
    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
    TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
    $(info WARNING - GCC variable has been deprecated)
    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
    HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
    $(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################

# architecture
HOST_ARCH   := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
            TARGET_SIZE := 64
        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
            TARGET_SIZE := 32
        endif
    else
        TARGET_SIZE := $(shell getconf LONG_BIT)
    endif
else
    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif

# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
    ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
        HOST_ARCH := sbsa
        TARGET_ARCH := sbsa
    endif
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
    endif
endif

# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
    TARGET_ARCH = armv7l
endif

# operating system
HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif

# host compiler
ifeq ($(TARGET_OS),darwin)
    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
        HOST_COMPILER ?= clang++
    endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
        ifeq ($(TARGET_OS),linux)
            HOST_COMPILER ?= arm-linux-gnueabihf-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
        else ifeq ($(TARGET_OS),android)
            HOST_COMPILER ?= arm-linux-androideabi-g++
        endif
    else ifeq ($(TARGET_ARCH),aarch64)
        ifeq ($(TARGET_OS), linux)
            HOST_COMPILER ?= aarch64-linux-gnu-g++
        else ifeq ($(TARGET_OS),qnx)
            ifeq ($(QNX_HOST),)
                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
            endif
            ifeq ($(QNX_TARGET),)
                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
            endif
            export QNX_HOST
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
        else ifeq ($(TARGET_OS), android)
            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),sbsa)
        HOST_COMPILER ?= aarch64-linux-gnu-g++
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
    endif
endif
HOST_COMPILER ?= g++
NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)

# internal flags
NVCCFLAGS   := -m${TARGET_SIZE}
CCFLAGS     :=
LDFLAGS     :=

# build flags
ifeq ($(TARGET_OS),darwin)
    LDFLAGS += -rpath $(CUDA_PATH)/lib
    CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
    LDFLAGS += -pie
    CCFLAGS += -fpie -fpic -fexceptions
endif

ifneq ($(TARGET_ARCH),$(HOST_ARCH))
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
        ifneq ($(TARGET_FS),)
            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
            ifeq ($(GCCVERSIONLTEQ46),1)
                CCFLAGS += --sysroot=$(TARGET_FS)
            endif
            LDFLAGS += --sysroot=$(TARGET_FS)
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
        endif
    endif
    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
        NVCCFLAGS += -D_QNX_SOURCE
        NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
        CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
        LDFLAGS += -lsocket
        LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
        CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
        ifdef TARGET_OVERRIDE
            LDFLAGS += -lslog2
        endif

        ifneq ($(TARGET_FS),)
            LDFLAGS += -L$(TARGET_FS)/usr/lib
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
            LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
            CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
            CCFLAGS += -I$(TARGET_FS)/../include
        endif
    endif
endif

ifdef TARGET_OVERRIDE # cuda toolkit targets override
    NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif

# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G
      BUILD_TYPE := debug
else
      BUILD_TYPE := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)

SAMPLE_ENABLED := 1

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES :=

################################################################################

FATBIN_FILE := vectorAdd_kernel${TARGET_SIZE}.fatbin

#Detect if installed version of GCC supports required C++11
ifeq ($(TARGET_OS),linux)
    empty :=
    space := $(empty) $(empty)
    GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
#Create version number without "."
    GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
    GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
    GCCVERSION += 00
# Remove spaces from the version number
    GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))

    IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)

    ifeq ($(IS_MIN_VERSION), 1)
        $(info >>> GCC Version is greater or equal to 4.7.0 <<<)
    else
        $(info >>> Waiving build. Minimum GCC version required is 4.7.0<<<)
        SAMPLE_ENABLED := 0
    endif
endif

# Gencode arguments
SMS ?= 52 60 61 70 75 80 86

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

ifeq ($(SMS),)
# Generate PTX code from SM 35
GENCODE_FLAGS += -gencode arch=compute_35,code=compute_35
endif

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

ifeq ($(TARGET_OS),darwin)
  ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
  ifeq ($(TARGET_ARCH),x86_64)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
    CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
  endif

  ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
    ifdef TARGET_OVERRIDE
        CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
    endif
  endif

  ifeq ($(TARGET_ARCH),ppc64le)
    CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
  endif

  ifeq ($(HOST_ARCH),ppc64le)
    CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
  endif

  CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
  ifeq ("$(CUDALIB)","")
    $(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed.  Please re-install the driver. <<<)
    SAMPLE_ENABLED := 0
  else
    CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
    LIBRARIES += -L$(CUDALIB) -lcuda
  endif
endif

ALL_CCFLAGS += --threads 0 --std=c++11

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

# Target rules
all: build

build: vectorAddDrv $(FATBIN_FILE)

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
        @echo "Sample will be waived due to the above missing dependencies"
else
        @echo "Sample is ready - all dependencies have been met"
endif

$(FATBIN_FILE): vectorAdd_kernel.cu
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -rdc=true -lcudadevrt -dlink  -o $@ -fatbin $<

vectorAddDrv.o:vectorAddDrv.cpp
        $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

vectorAddDrv: vectorAddDrv.o
        $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)

run: build
        $(EXEC) ./vectorAddDrv

clean:
        rm -f vectorAddDrv vectorAddDrv.o  $(FATBIN_FILE)

clobber: clean
$ make clean
>>> GCC Version is greater or equal to 4.7.0 <<<
rm -f vectorAddDrv vectorAddDrv.o  vectorAdd_kernel64.fatbin
$ make
>>> GCC Version is greater or equal to 4.7.0 <<<
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv.o -c vectorAddDrv.cpp
/usr/local/cuda/bin/nvcc -ccbin g++   -m64      -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -o vectorAddDrv vectorAddDrv.o  -L/usr/local/cuda/lib64/stubs -lcuda
/usr/local/cuda/bin/nvcc -ccbin g++ -I/usr/local/cuda/samples/common/inc  -m64    --threads 0 --std=c++11 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -rdc=true -lcudadevrt -dlink  -o vectorAdd_kernel64.fatbin -fatbin vectorAdd_kernel.cu
$ cuda-memcheck ./vectorAddDrv
========= CUDA-MEMCHECK
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Tesla V100-PCIE-32GB
> findModulePath found file at <./vectorAdd_kernel64.fatbin>
> initCUDA loading module: <./vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1
========= ERROR SUMMARY: 0 errors
$

On Windows/VS 2019/CUDA 11.1, I followed these steps:

Open the vectorAddDrv project/solution, on my machine it was in: C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv
Replace the code in the vectorAddDrv.cpp file with the code from the same file above in the linux example.
Replace the code in the vectorAdd_kernel.cu file with the code from the same file above in the linux example.
In the solution explorer pane on the left, right-click on the vectorAdd_kernel.cu file, and open the properties. Then in Configuration Properties...CUDA C/C++...Common change "Generate Relocatable Device Code" from No to Yes. Then in Configuration Properties...CUDA C/C++...Command Line add -dlink. Also make sure that Configuration Properties...CUDA C/C++...Device...Code Generation matches the device(s) you want to run on. Click "OK".
In the same solution explorer pane on the left, right click on the vectorAddDrv project, select Properties, then in Configuration Properties...CUDA Linker...General change "Perform Device Link" from Yes to No. Click "OK".
Select Build...Rebuild Solution.

When I do that I see console build output like this:

1>------ Rebuild All started: Project: vectorAddDrv, Configuration: Debug x64 ------
1>Compiling CUDA source file vectorAdd_kernel.cu...
1>
1>C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\bin\nvcc.exe" -gencode=arch=compute_61,code=sm_61 --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu -rdc=true  -I./ -I../../common/inc -I./ -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\/include" -I../../common/inc -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.1\include"  -G   --keep-dir x64\Debug -maxrregcount=0  --machine 64 -fatbin -cudart static -dlink -Xcompiler "/wd 4819" -o data/vectorAdd_kernel64.fatbin "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\vectorAdd_kernel.cu"
1>vectorAdd_kernel.cu
1>vectorAddDrv.cpp
1>LINK : ..\..\bin\win64\Debug\\vectorAddDrv.exe not found or not built by the last incremental link; performing full link
1>vectorAddDrv_vs2019.vcxproj -> C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\0_Simple\vectorAddDrv\../../bin/win64/Debug/vectorAddDrv.exe
========== Rebuild All: 1 succeeded, 0 failed, 0 skipped ==========

If we then open a command prompt and navigate to the indicated location for the executable, and run it, I see:

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>vectorAddDrv
Linked CDP demo (Driver API)
> Using CUDA Device [0]: Quadro P4000
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ./
...
sdkFindFilePath <vectorAdd_kernel64.fatbin> in ../../../0_Simple/vectorAddDrv/data/
> findModulePath found file at <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
> initCUDA loading module: <../../../0_Simple/vectorAddDrv/data/vectorAdd_kernel64.fatbin>
kernel level 4
kernel level 3
kernel level 2
kernel level 1

C:\ProgramData\NVIDIA Corporation\CUDA Samples\v11.1\bin\win64\Debug>

One of the notes in the other answer applies here as well: The extended sequence of sdkFindFilePath messages printed at runtime can be shortened by copying the fatbin file from its location to the location of the exe file. The final sdkFindFilePath output will tell you where it found the fatbin file.

Error compiling CUDA dynamic parallelism code with Driver API

2 Answers2

Linked