There is no magic involved in using device function pointers in device code. It is functionally and syntactically identical to standard C++.
For example:
#include <cstdio>

// Type of a unary device function taking and returning an int.
typedef int (*ufunc)(int x);

__device__ int f1(int x)
{
    int res = 2 * x;
    printf("f1 arg = %d, res = %d\n", x, res);
    return res;
}

// f2 receives a function pointer of type ufunc and applies it to y.
__device__ int f2(int x, int y, ufunc op)
{
    int res = x + op(y);
    printf("f2 arg = %d, %d, res = %d\n", x, y, res);
    return res;
}

__global__ void kernel(int *z)
{
    int x = threadIdx.x;
    int y = blockIdx.x;
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    z[tid] = f2(x, y, &f1);   // static assignment of the function pointer
}

int main()
{
    const int nt = 4, nb = 4;
    int* a_d;
    cudaMalloc(&a_d, sizeof(int) * nt * nb);   // int array, so sizeof(int)
    kernel<<<nb, nt>>>(a_d);
    cudaDeviceSynchronize();
    cudaFree(a_d);
    cudaDeviceReset();
    return 0;
}
Here, we define a pointer to a unary function as a type (ufunc), and then a device function (f2) which takes that type as an argument. Because the function pointer is assigned statically inside the kernel, everything is resolved at compile time and it just works. If you want function pointer selection to happen at run time, then you need to follow the instructions given in the link you were already provided with.
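For illustration only, here is a minimal sketch of one common run-time pattern (an assumption on my part, not necessarily the exact method described in that link): the address of each __device__ function is captured in a __device__ function-pointer variable, copied back to the host with cudaMemcpyFromSymbol, and then passed to the kernel as an ordinary argument.

#include <cstdio>

typedef int (*ufunc)(int x);

__device__ int f1(int x) { return 2 * x; }
__device__ int f3(int x) { return 3 * x; }

// Device-side variables holding the addresses of the device functions.
// Taking &f1 on the host is not meaningful; the address must be captured
// in device code or in a device-side initializer like this.
__device__ ufunc d_f1 = f1;
__device__ ufunc d_f3 = f3;

__global__ void kernel(ufunc op)
{
    printf("op(%d) = %d\n", threadIdx.x, op(threadIdx.x));
}

int main(int argc, char **argv)
{
    // Select the function at run time, e.g. from a command-line flag.
    ufunc h_op;
    if (argc > 1)
        cudaMemcpyFromSymbol(&h_op, d_f3, sizeof(ufunc));
    else
        cudaMemcpyFromSymbol(&h_op, d_f1, sizeof(ufunc));

    kernel<<<1, 4>>>(h_op);
    cudaDeviceSynchronize();
    return 0;
}

The reason for the indirection through d_f1 and d_f3 is that device function addresses only exist in device code, so they are stored in device-side variables and fetched to the host before launch.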
The important thing to keep in mind here is that in CUDA it is not legal to include CUDA specifiers (__device__, __constant__, __global__, etc.) in type definitions. Each variable instance carries the specifier as part of its own definition.
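As a small illustrative sketch (re-using f1 from the example above), the type stays plain C++ and the specifier attaches to each variable:

__device__ int f1(int x) { return 2 * x; }

typedef int (*ufunc)(int);     // plain C++ type; no CUDA specifier is allowed here

__device__ ufunc d_op = f1;    // the __device__ specifier qualifies this variable instance

// Not legal: putting the specifier inside the type definition itself, e.g.
// typedef __device__ int (*bad_func)(int);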