CUDA initialization error after fork

Question

I get "initialization error" after calling fork(). If I run the same program without the fork, all works fine.

if (fork() == 0) {
    ...
    cudaMalloc(....);
    ...
}

What would cause this?

A complete example is below. If I comment out the cudaGetDeviceCount call, it works fine.

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <cuda_runtime.h>

#define PERR(call) \
  if (call) {\
   fprintf(stderr, "%s:%d Error [%s] on "#call"\n", __FILE__, __LINE__,\
      cudaGetErrorString(cudaGetLastError()));\
   exit(1);\
  }

int
main(int argc, char **argv)
{
  float *v_d;
  int gpucount;

  cudaGetDeviceCount(&gpucount);

  if (fork() == 0) {
    cudaSetDevice(0);
    PERR(cudaMalloc(&v_d, 1000*sizeof(float)));
  }
  wait(NULL);
  return 0;
}

Simple Makefile:

PROGS = fork
CUDA_PATH = /usr/local/cuda
CXXFLAGS = -g -O0 -Wall
CXXINCLUDES = -I$(CUDA_PATH)/include
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(CXX) -Xcompiler "$(CXXFLAGS)"

fork: fork.cxx
        $(NVCC) $^ -o $@ $(LIBS)

clean:
        (rm $(PROGS) *.o)

In this case, I am only trying to get the number of devices available from within the parent process. This work-around does it:

  if (fork() == 0) {
    PERR(cudaGetDeviceCount(&gpucount));
    return(gpucount);
  }
  wait(&gpucount);
  gpucount =  WEXITSTATUS(gpucount);

Can you provide a complete code demonstrating what you are doing? I didn't have any trouble with a `fork` and a `cudaMalloc`. — Robert Crovella, Apr 08 '14 at 23:32
I think I have a clue now. The program was calling cudaSetDevice before the fork. If I move the call inside the fork, it runs. I'll put together a small example. — Bob, Apr 08 '14 at 23:35

score 17 · Accepted Answer · edited May 23 '17 at 10:33

fork() creates a child process. Processes have their own address spaces. A CUDA context cannot be shared between two different processes, for many reasons, one of which is that various pointers would be meaningless in a different address space.

If you create a CUDA context before the fork(), you cannot use that within the child process. The cudaSetDevice(0); call attempts to share the CUDA context, implicitly created in the parent process when you call cudaGetDeviceCount();

The solution, as you've hinted at, is either to do your CUDA work in the parent process or in the child process. If you are in a multi-device system, it should be possible to allocate separate devices to separate processes (the CUDA simpleIPC sample code does exactly this). (The key point is to not create a CUDA context before the fork.)

You may be interested in this question/answer and this one.

Here's a fully worked example (requires 2 CUDA devices) showing a child process and a parent process using separate GPUs:

$ cat t345.cu
#include <unistd.h>     /* Symbolic Constants */
#include <sys/types.h>  /* Primitive System Data Types */
#include <errno.h>      /* Errors */
#include <stdio.h>      /* Input/Output */
#include <sys/wait.h>   /* Wait for Process Termination */
#include <stdlib.h>     /* General Utilities */


#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


__global__ void addkernel(int *data){
  *data += 1;
}

int main()
{
    pid_t childpid; /* variable to store the child's pid */
    int retval;     /* child process: user-provided return code */
    int status;     /* parent process: child's exit status */

    /* only 1 int variable is needed because each process would have its
       own instance of the variable
       here, 2 int variables are used for clarity */

    /* now create new process */
    childpid = fork();

    if (childpid >= 0) /* fork succeeded */
    {
        if (childpid == 0) /* fork() returns 0 to the child process */
        {
            printf("CHILD: I am the child process!\n");
            printf("CHILD: Here's my PID: %d\n", getpid());
            printf("CHILD: My parent's PID is: %d\n", getppid());
            printf("CHILD: The value of my copy of childpid is: %d\n", childpid);
            int *h_a, *d_a;
            h_a = (int *)malloc(sizeof(int));
            cudaSetDevice(0);
            cudaCheckErrors("CHILD cudaSetDevice fail");
            cudaMalloc(&d_a, sizeof(int));
            cudaCheckErrors("cudaMalloc fail");
            *h_a = 1;
            cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy H2D fail");
            addkernel<<<1,1>>>(d_a);
            cudaDeviceSynchronize();
            cudaCheckErrors("kernel fail");
            cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy D2H fail");
            printf("CHILD: result: %d\n", *h_a);

            printf("CHILD: Sleeping for 1 second...\n");
            sleep(1); /* sleep for 1 second */
            cudaDeviceReset();
            printf("CHILD: Enter an exit value (0 to 255): ");
            scanf(" %d", &retval);
            printf("CHILD: Goodbye!\n");
            exit(retval); /* child exits with user-provided return code */
        }
        else /* fork() returns new pid to the parent process */
        {
            printf("PARENT: I am the parent process!\n");
            printf("PARENT: Here's my PID: %d\n", getpid());
            printf("PARENT: The value of my copy of childpid is %d\n", childpid);
            printf("PARENT: I will now wait for my child to exit.\n");
            int *h_a, *d_a;
            h_a = (int *)malloc(sizeof(int));
            cudaSetDevice(1);
            cudaCheckErrors("PARENT cudaSetDevice fail");
            cudaMalloc(&d_a, sizeof(int));
            cudaCheckErrors("cudaMalloc fail");
            *h_a = 2;
            cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy H2D fail");
            addkernel<<<1,1>>>(d_a);
            cudaDeviceSynchronize();
            cudaCheckErrors("kernel fail");
            cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy D2H fail");
            printf("PARENT: result: %d\n", *h_a);
            wait(&status); /* wait for child to exit, and store its status */
            printf("PARENT: Child's exit code is: %d\n", WEXITSTATUS(status));
            cudaSetDevice(0);
            cudaCheckErrors("PARENT cudaSetDevice  2 fail");
            int *h_a2, *d_a2;
            cudaMalloc(&d_a2, sizeof(int));
            cudaCheckErrors("cudaMalloc fail");
            h_a2 = (int *)malloc(sizeof(int));
            *h_a2 = 5;
            cudaMemcpy(d_a2, h_a2, sizeof(int), cudaMemcpyHostToDevice);
            cudaCheckErrors("cudaMemcpy H2D fail");
            addkernel<<<1,1>>>(d_a2);
            cudaDeviceSynchronize();
            cudaCheckErrors("kernel fail");
            cudaMemcpy(h_a2, d_a2, sizeof(int), cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemcpy D2H fail");
            printf("PARENT: result2: %d\n", *h_a2);
            printf("PARENT: Goodbye!\n");
            exit(0);  /* parent exits */
        }
    }
    else /* fork returns -1 on failure */
    {
        perror("fork"); /* display error message */
        exit(0);
    }
}
$ nvcc -arch=sm_20 -o t345 t345.cu
$ ./t345
CHILD: I am the child process!
CHILD: Here's my PID: 23603
CHILD: My parent's PID is: 23602
CHILD: The value of my copy of childpid is: 0
PARENT: I am the parent process!
PARENT: Here's my PID: 23602
PARENT: The value of my copy of childpid is 23603
PARENT: I will now wait for my child to exit.
CHILD: result: 2
CHILD: Sleeping for 1 second...
PARENT: result: 3
CHILD: Enter an exit value (0 to 255): 10
CHILD: Goodbye!
PARENT: Child's exit code is: 10
PARENT: result2: 6
PARENT: Goodbye!
$

(modified from here)

I am disappointed that it is not mentioned in the "Toolkit Documentation". The example "0_Simple/simpleIPC/simpleIPC.cu" explains how to work-around this. I have also added a simple work-around for my use case into the original question. There is another misleading call in the reference "cudaDeviceReset();" which does not seem to do anything to reset the underlying context. — Bob, Apr 09 '14 at 00:28
But what if you need to call `cudaGetDeviceCount` before the fork? If I'm using multiple GPUs and using one process per GPU, I want to know how many processes to spawn ahead of time. — landau, Jun 21 '15 at 11:02
It's probably better to use one thread per GPU than one process per GPU. If you want to use processes, then create processes for the maximum number of GPUs that your program can handle (this is probably no larger than 8). Each process then has its own unique ID, and can query `cudaGetDeviceCount`. If no GPU exists to correspond to that process, then the process just exits. For example, if you have 4 GPUs, but spin up 8 processes, then processes 0-3 would each get a GPU and processes 4-7 would see that there are only 4 GPUs and so would just exit. I'm sure there are other approaches as well. — Robert Crovella, Jun 21 '15 at 14:41
For another approach, take a look at the `simpleIPC` [cuda sample code](http://docs.nvidia.com/cuda/cuda-samples/index.html#simpleipc). It spins up a worker process to query device count, then lets that process exit, which destroys its context. The device count is communicated back to the original process, which then uses that to spawn one process per GPU. — Robert Crovella, Jun 21 '15 at 15:17

CUDA initialization error after fork

1 Answers1

Linked