Passing a function pointer and its parameters as a thrust::tuple to a global function
Something like this should be workable:
$ cat t1161.cu
#include <thrust/tuple.h>
#include <stdio.h>
// Parent kernel ("someFunction"): invokes the supplied device function
// pointer once per tuple element, in order.
//   fp     - device-valid function pointer (fetched on the host from a
//            __device__ symbol via cudaMemcpyFromSymbol)
//   params - thrust::tuple whose first two elements are forwarded to fp
template <typename T, typename T1>
__global__ void kernel(void (*fp)(T1), T params){ // "someFunction"
  T1 first  = thrust::get<0>(params);
  T1 second = thrust::get<1>(params);
  fp(first);
  fp(second);
}
// Device-side worker ("otherFunction"): prints the integer it receives.
// Device printf output is flushed at the next host synchronization point.
__device__ void df(int n){ // "otherFunction"
  const int value = n;
  printf("parameter = %d\n", value);
}
// Device-resident pointer to df. Host code cannot take the address of a
// __device__ function directly, so main reads this symbol with
// cudaMemcpyFromSymbol to obtain a pointer that is valid in device code.
__device__ void (*ddf)(int) = df;
// Host driver: copies the device function pointer out of the ddf symbol,
// then launches the kernel with that pointer and a two-element tuple.
// All CUDA runtime results are now checked instead of silently dropped:
// a launch-config failure surfaces via cudaGetLastError(), an in-kernel
// fault via cudaDeviceSynchronize().
int main(){
  void (*hdf)(int);  // host-side copy of the device function pointer
  thrust::tuple<int, int> my_tuple = thrust::make_tuple(1,2);
  // report (rather than ignore) any CUDA runtime error
  auto ck = [](cudaError_t e, const char *what){
    if (e != cudaSuccess)
      printf("CUDA error (%s): %s\n", what, cudaGetErrorString(e));
  };
  ck(cudaMemcpyFromSymbol(&hdf, ddf, sizeof(void *)), "cudaMemcpyFromSymbol");
  kernel<<<1,1>>>(hdf, my_tuple);
  ck(cudaGetLastError(), "kernel launch");          // launch-config errors
  ck(cudaDeviceSynchronize(), "cudaDeviceSynchronize"); // async exec errors
  return 0;
}
$ nvcc -o t1161 t1161.cu
$ cuda-memcheck ./t1161
========= CUDA-MEMCHECK
parameter = 1
parameter = 2
========= ERROR SUMMARY: 0 errors
$
A similar methodology should also be workable if you intend `df` to be a `__global__` function; you will just need to account properly for the dynamic-parallelism case (compile with `-rdc=true` and link against `cudadevrt`). Likewise, only a slight variation on the above should allow you to pass the tuple directly to the child function (i.e. `df`, whether device function or kernel). It's not clear to me why you need variadic template arguments if your parameters are nicely packaged up in a thrust tuple.
EDIT: If you can pass your tuple to the child kernel (I don't see why you wouldn't be able to, since according to your updated example the tuple and the child kernel share the same variadic parameter pack), then you may still be able to avoid variadic templates using this approach:
$ cat t1162.cu
#include <thrust/tuple.h>
#include <stdio.h>
// Parent kernel: uses dynamic parallelism to launch the child kernel
// addressed by fp with the given tuple, then waits for it to finish so the
// child's printf output is ordered before the parent returns.
// Requires -rdc=true and linking cudadevrt.
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated and
// removed in newer toolkits (CUDA 12.x) -- confirm against the target
// toolkit version.
template<typename T>
__global__ void someFunction(void (*fp)(T), T params) {
  (*fp)<<<1,1>>>(params);
  cudaDeviceSynchronize();
}
// Child kernel: prints element 0 of a one-element int tuple.
__global__ void otherFunction(thrust::tuple<int> t) {
  const int p0 = thrust::get<0>(t);
  printf("param 0 = %d\n", p0);
}
// Child kernel: prints element 1 of a two-element float tuple.
__global__ void otherFunction2(thrust::tuple<float, float> t) {
  const float p1 = thrust::get<1>(t);
  printf("param 1 = %f\n", p1);
}
// Device-resident pointers to the child kernels. The host reads these
// symbols with cudaMemcpyFromSymbol because a __global__ function's
// address taken in host code is not usable from device code; these
// initializers are evaluated in the device compilation pass.
__device__ void (*kfp)(thrust::tuple<int>) = &otherFunction;
__device__ void (*kfp2)(thrust::tuple<float, float>) = &otherFunction2;
// Host driver: fetches each device-side kernel pointer from its __device__
// symbol and hands it, together with a matching tuple, to the parent kernel
// someFunction. All CUDA runtime results are now checked instead of being
// silently dropped: launch-config failures surface via cudaGetLastError(),
// in-kernel/child-kernel faults via cudaDeviceSynchronize().
int main(int argc, char **argv) {
  void (*h_kfp)(thrust::tuple<int>);
  void (*h_kfp2)(thrust::tuple<float, float>);
  // report (rather than ignore) any CUDA runtime error
  auto ck = [](cudaError_t e, const char *what){
    if (e != cudaSuccess)
      printf("CUDA error (%s): %s\n", what, cudaGetErrorString(e));
  };
  ck(cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost),
     "cudaMemcpyFromSymbol(kfp)");
  someFunction<<<1,1>>>(h_kfp, thrust::make_tuple(1));
  ck(cudaGetLastError(), "someFunction launch");
  ck(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  ck(cudaMemcpyFromSymbol(&h_kfp2, kfp2, sizeof(void *), 0, cudaMemcpyDeviceToHost),
     "cudaMemcpyFromSymbol(kfp2)");
  someFunction<<<1,1>>>(h_kfp2, thrust::make_tuple(0.5f, 1.5f));
  ck(cudaGetLastError(), "someFunction launch");
  ck(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
  return 0;
}
$ nvcc -arch=sm_35 -rdc=true -o t1162 t1162.cu -lcudadevrt
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t1162
========= CUDA-MEMCHECK
param 0 = 1
param 1 = 1.500000
========= ERROR SUMMARY: 0 errors
$
In terms of functionality (being able to dispatch multiple child kernels with varying parameter packs) I don't see any difference in capability, again assuming your parameters are nicely packaged in a tuple.