-1

How to make something like this work?

// NOTE(review): `double f(double)` in the parameter list decays to a plain
// function pointer. As used in main below, the address passed is that of a
// HOST function, which is not a valid address to call from device code --
// this is the root cause of the reported failure.
#define Eval(x, y, func) {y = func(x);}
// Kernel: elementwise apply, ys[i] = f(xs[i]) for i < N.
// Indexes with threadIdx.x only, so it assumes a single-block launch
// (<<<1, N>>> with N <= blockDim.x).
__global__ void Evaluate(double *xs, double *ys, int N, double f(double))
{
    int tid = threadIdx.x;
    if (tid < N)
        Eval(xs[tid], ys[tid], f);
        
    
}

And then inside main function I have

    // Device (_d) and host (_h) buffers for 256 doubles each.
    double *xs_d, *ys_d;
    double *xs_h, *ys_h;
    xs_h = (double *) malloc(sizeof(double) * 256);
    ys_h = (double *) malloc(sizeof(double) * 256);
    // NOTE(review): these cudaMalloc return codes are unchecked, unlike the
    // HANDLE_ERROR-wrapped cudaMemcpy calls below.
    cudaMalloc((void **)&xs_d, sizeof(double) * 256);
    cudaMalloc((void **)&ys_d, sizeof(double) * 256);
    for (int i = 0; i < 256; i++)
    {
        xs_h[i] = (double)i;
    }
    HANDLE_ERROR(cudaMemcpy(xs_d, xs_h, 256*sizeof(double), cudaMemcpyHostToDevice));
    // BUG: `Sin` is a host-side function; its address is meaningless in device
    // code, so the kernel faults. Kernel launches are asynchronous, so the
    // error only surfaces at the next synchronizing call -- the
    // cudaMemcpy on the last line, which is where the failure is observed.
    Evaluate<<<1,256>>>(xs_d, ys_d, 256, Sin);
    cudaDeviceSynchronize();
    HANDLE_ERROR(cudaMemcpy(ys_h, ys_d, 256*sizeof(double), cudaMemcpyDeviceToHost));

It fails in the last line. So far I have seen solutions like this How to pass device function as an input argument to host-side function? but there they use __device__ functions, which cannot be changed or accessed by a host (main for example) function. For example, I cannot put __device__ int *fptrf1 = (int *)f1; (taken from the example) inside main. Is it possible to somehow have this flexibility?

Sayan
  • 99
  • 4

1 Answer

2

For example, I cannot put __device__ int *fptrf1 = (int *)f1; (taken from the example) inside main. Is it possible to somehow have this flexibility?

One possible approach is to use a lambda:

$ cat t151.cu
// Helper macro: evaluate func at x and store into y.
#define Eval(x, y, func) {y = func(x);}
// Elementwise apply: ys[i] = f(xs[i]) for i in [0, N).
// F is any device-callable type with signature double(double) -- e.g. an
// extended lambda (compile with --extended-lambda).
// Uses a grid-stride loop, so it is correct for ANY <<<grid, block>>>
// configuration (the original threadIdx.x-only indexing silently dropped or
// duplicated work for multi-block launches); behavior is identical for the
// single-block <<<1,256>>> launch shown below.
template <typename F>
__global__ void Evaluate(double *xs, double *ys, int N, F f)
{
    int stride = gridDim.x * blockDim.x;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
        Eval(xs[i], ys[i], f);
}

#include <cstdio>
#include <cstdlib>
#include <cmath>

// Abort with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int main(){
    const int N = 256;   // element count (was a repeated magic constant)
    double *xs_d, *ys_d;
    double *xs_h, *ys_h;
    xs_h = (double *) malloc(sizeof(double) * N);
    ys_h = (double *) malloc(sizeof(double) * N);
    if (xs_h == NULL || ys_h == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    checkCuda(cudaMalloc((void **)&xs_d, sizeof(double) * N), "cudaMalloc xs_d");
    checkCuda(cudaMalloc((void **)&ys_d, sizeof(double) * N), "cudaMalloc ys_d");
    for (int i = 0; i < N; i++)
        xs_h[i] = (double)i;

    checkCuda(cudaMemcpy(xs_d, xs_h, N*sizeof(double), cudaMemcpyHostToDevice),
              "H2D copy");
    // An extended lambda (--extended-lambda) is device-callable, unlike a host
    // function pointer, so it can be passed to the templated kernel by value.
    auto Sinlambda = [] __host__ __device__ (double v) {return sin(v);};
    Evaluate<<<1,N>>>(xs_d, ys_d, N, Sinlambda);
    checkCuda(cudaGetLastError(), "kernel launch");          // launch-config errors
    checkCuda(cudaDeviceSynchronize(), "kernel execution");  // async execution errors
    checkCuda(cudaMemcpy(ys_h, ys_d, N*sizeof(double), cudaMemcpyDeviceToHost),
              "D2H copy");

    // Verify against the host-side sin() (lambda is __host__ __device__,
    // so both sides compute the same function).
    for (int i = 0; i < N; i++) {
        if (fabs(ys_h[i] - sin((double)i)) > 1e-12) {
            fprintf(stderr, "mismatch at index %d\n", i);
            return EXIT_FAILURE;
        }
    }

    cudaFree(xs_d);
    cudaFree(ys_d);
    free(xs_h);
    free(ys_h);
    return 0;
}
$ nvcc -o t151 t151.cu -std=c++11 --extended-lambda
$ cuda-memcheck ./t151
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

(CUDA 11.3)

For a variety of device function pointer uses, this answer links to a number of examples.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • Thanks. I have a few follow-up questions. What does `__host__ __device__` mean? What does `[]` in `[] __host__ __device__` mean? Is this a pointer to the function? Instead of using the template, can I hard-code the type? What would the type be then? – Sayan May 29 '21 at 06:29
  • 1
    `__host__ __device__` is used to declare to the CUDA compiler that you want this function callable from either host or device code. The `__host__` portion is unnecessary for this example, you could delete that and it would work the same way. `[ ]` syntax is part of the C++ lambda syntax. I wouldn't be able to give you a tutorial on C++ lambda functions in the space of these comments. – Robert Crovella May 29 '21 at 14:07