Passing a function pointer and its parameters as a thrust::tuple to a global function

I want to do the following:

#include <thrust/tuple.h>
#include <tuple>

template<typename... Args>
void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}

void otherFunction(int n) {
}

int main(int argc, char **argv) {
    //// template argument deduction/substitution failed ////
    someFunction<int>(&otherFunction, thrust::make_tuple(1));
    return 0;
}

What I have tried:

  1. Removing one of the two parameters leads to a working solution, of course.
  2. It works when I make someFunction a static function in a struct with a template parameter. But in the original code someFunction is a CUDA kernel, so I can't do that. Any further ideas?
  3. It works when I change thrust::tuple to std::tuple. Is there a way to construct a thrust::tuple out of a std::tuple? (One possible conversion is sketched right after this list.)
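
A sketch of what such a conversion could look like, assuming host-side C++14 std::index_sequence is available (to_thrust_tuple is a hypothetical helper name, not part of Thrust):

#include <thrust/tuple.h>
#include <tuple>
#include <utility>
#include <cstddef>

// hypothetical helper: build a thrust::tuple from the elements of a std::tuple
template<typename... Args, std::size_t... I>
thrust::tuple<Args...> to_thrust_tuple_impl(const std::tuple<Args...> &t, std::index_sequence<I...>) {
    return thrust::tuple<Args...>(std::get<I>(t)...);
}

template<typename... Args>
thrust::tuple<Args...> to_thrust_tuple(const std::tuple<Args...> &t) {
    return to_thrust_tuple_impl(t, std::index_sequence_for<Args...>{});
}

// usage (host code): thrust::tuple<int, float> t = to_thrust_tuple(std::make_tuple(1, 2.0f));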

EDIT:

To make it clearer: someFunction and otherFunction are __global__!

#include <thrust/tuple.h>
#include <tuple>

template<typename... Args>
__global__ void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}

__global__ void otherFunction(int n) {
}
__constant__ void (*kfp)(int) = &otherFunction;

int testPassMain(int argc, char **argv) {
    void (*h_kfp)(int);
    cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
    return 0;
}

I get a compiler error: template argument deduction/substitution failed in both examples.

  • *Maybe* unrelated to your problem, but you might take a hint from just about [all standard algorithm functions](http://en.cppreference.com/w/cpp/algorithm) that take a "predicate" as argument. They don't really care about the arguments to the function, they just have a single `typename` template argument for the function. – Some programmer dude Jun 22 '16 at 16:05
  • If `someFunction` is a CUDA kernel (i.e. a `__global__` function), why have you not configured it in your example (at launch) or decorated it accordingly? In my view this question is pretty unclear. Is `otherFunction` intended to be callable from a `__global__` function? If so, why haven't you decorated it accordingly? You cannot take the address of a device function in host code, which appears to be what you are doing here (even if you decorated `otherFunction` with `__device__`, it still won't work as written) – Robert Crovella Jun 22 '16 at 16:08
  • The question is not about calling kernels from kernel function pointers. I leave this part out, because it works. It's about a compiler error when passing two arguments with variadic templates to a global function. – martin Jun 22 '16 at 16:28

2 Answers

Something like this should be workable:

$ cat t1161.cu
#include <thrust/tuple.h>
#include <stdio.h>

template <typename T, typename T1>
__global__ void kernel(void (*fp)(T1), T params){ // "someFunction"

  fp(thrust::get<0>(params));
  fp(thrust::get<1>(params));
}

__device__ void df(int n){                        // "otherFunction"

  printf("parameter = %d\n", n);
}

__device__ void (*ddf)(int) = df;

int main(){

  void (*hdf)(int);
  thrust::tuple<int, int> my_tuple = thrust::make_tuple(1,2);
  cudaMemcpyFromSymbol(&hdf, ddf, sizeof(void *));
  kernel<<<1,1>>>(hdf, my_tuple);
  cudaDeviceSynchronize();
}


$ nvcc -o t1161 t1161.cu
$ cuda-memcheck ./t1161
========= CUDA-MEMCHECK
parameter = 1
parameter = 2
========= ERROR SUMMARY: 0 errors
$

A similar methodology should also be workable if you intend df to be a __global__ function; you will just need to account properly for the dynamic parallelism case. Likewise, only a slight variation on the above should allow you to pass the tuple directly to the child function (i.e. df, whether device function or kernel). It's not clear to me why you need variadic template arguments if your parameters are nicely packaged up in a thrust tuple.
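
For illustration, a minimal sketch (untested here) of that variation, with the whole tuple passed straight through to a __device__ child function, could look like this; it is the same pattern as t1161.cu above with df's signature changed:

#include <thrust/tuple.h>
#include <stdio.h>

template <typename T>
__global__ void kernel(void (*fp)(T), T params){   // "someFunction": forwards the packaged parameters
  fp(params);
}

__device__ void df(thrust::tuple<int, int> t){      // "otherFunction": receives the tuple directly
  printf("parameters = %d, %d\n", thrust::get<0>(t), thrust::get<1>(t));
}

__device__ void (*ddf)(thrust::tuple<int, int>) = df;

int main(){
  void (*hdf)(thrust::tuple<int, int>);
  thrust::tuple<int, int> my_tuple = thrust::make_tuple(1, 2);
  cudaMemcpyFromSymbol(&hdf, ddf, sizeof(void *));
  kernel<<<1,1>>>(hdf, my_tuple);
  cudaDeviceSynchronize();
}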

EDIT: If you can pass your tuple to the child kernel (I don't see why you wouldn't be able to, since according to your updated example the tuple and the child kernel share the same variadic parameter pack), then you may still be able to avoid variadic templates using this approach:

$ cat t1162.cu
#include <thrust/tuple.h>
#include <stdio.h>

template<typename T>
__global__ void someFunction(void (*fp)(T), T params) {
  fp<<<1,1>>>(params);
  cudaDeviceSynchronize();
}

__global__ void otherFunction(thrust::tuple<int> t) {
  printf("param 0 = %d\n", thrust::get<0>(t));
}

__global__ void otherFunction2(thrust::tuple<float, float> t) {
  printf("param 1 = %f\n", thrust::get<1>(t));
}
__device__ void (*kfp)(thrust::tuple<int>) = &otherFunction;
__device__ void (*kfp2)(thrust::tuple<float, float>) = &otherFunction2;

int main(int argc, char **argv) {
    void (*h_kfp)(thrust::tuple<int>);
    void (*h_kfp2)(thrust::tuple<float, float>);
    cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    someFunction<<<1,1>>>(h_kfp, thrust::make_tuple(1));
    cudaDeviceSynchronize();
    cudaMemcpyFromSymbol(&h_kfp2, kfp2, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    someFunction<<<1,1>>>(h_kfp2, thrust::make_tuple(0.5f, 1.5f));
    cudaDeviceSynchronize();
    return 0;
}
$ nvcc -arch=sm_35 -rdc=true -o t1162 t1162.cu -lcudadevrt
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t1162
========= CUDA-MEMCHECK
param 0 = 1
param 1 = 1.500000
========= ERROR SUMMARY: 0 errors
$

In terms of functionality (being able to dispatch multiple child kernels with varying parameter packs) I don't see any difference in capability, again assuming your parameters are nicely packaged in a tuple.

  • The reason why I use variadic templates is that I want to pass arbitrary kernel function pointers to a scheduler. The tuple contains one set of parameters for one kernel. – martin Jun 22 '16 at 16:38
  • I've added a second approach which I think addresses that concern. You can pass arbitrary kernel function pointers to a scheduling kernel, which then dispatches those kernels with the supplied parameter pack. – Robert Crovella Jun 22 '16 at 17:10
  • Thank you so much for your help! I don't want to change kernels outside the scheduler (e.g. otherFunction). I unpack the tuple to call kernels. I already have a CPU scheduler which works fine and this is the only missing part in a working GPU scheduler. Any further ideas? Thanks. – martin Jun 22 '16 at 17:28
  • I don't think sharing the parameter pack from `thrust::tuple` with `(*fp)(Args...)` is going to work. In the case of the `Args...` associated with your function, it is just `int`, but in the case of a thrust tuple with a single `int`, the `Args...` are actually `int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type`, so we get the following note after your substitution failed message: `note: inconsistent parameter pack deduction...` – Robert Crovella Jun 22 '16 at 19:53
  • Exactly! thrust::tuple has a fixed size of 10, so that's the problem. As I said, it works with static functions, but not with global functions. – martin Jun 23 '16 at 04:37

A quick and dirty solution is to cast the function pointer:

#include <thrust/tuple.h>
#include <tuple>
#include <stdio.h>

template<typename... Args>
__global__ void someFunction(void (*fp)(), thrust::tuple<Args...> params) {
    void (*kfp)(Args...) = (void (*)(Args...)) fp;
    kfp<<<1,1>>>(thrust::get<0>(params));
}

__global__ void otherFunction(int n) {
    printf("n = %d\n", n);
}
__constant__ void (*kfp)(int) = &otherFunction;

int testPassMain(int argc, char **argv) {
    void (*h_kfp)();
    cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
    return 0;
}
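
One way this could be extended to dispatch the full parameter set (rather than just thrust::get<0>) might be to unpack the tuple through a hand-rolled index sequence. This is only a sketch (index_seq, make_index_seq and launchUnpacked are made-up helper names, and I have not verified it against a particular CUDA/Thrust version):

#include <thrust/tuple.h>
#include <stdio.h>
#include <cstddef>

// hand-rolled index sequence (hypothetical helper; avoids depending on C++14 in device code)
template<std::size_t... I> struct index_seq {};
template<std::size_t N, std::size_t... I> struct make_index_seq : make_index_seq<N - 1, N - 1, I...> {};
template<std::size_t... I> struct make_index_seq<0, I...> { typedef index_seq<I...> type; };

// unpack the first sizeof...(Args) tuple elements into a child kernel launch
template<typename... Args, typename Tuple, std::size_t... I>
__device__ void launchUnpacked(void (*fp)(Args...), const Tuple &t, index_seq<I...>) {
    fp<<<1, 1>>>(thrust::get<I>(t)...);   // dynamic parallelism: needs -rdc=true and sm_35+
}

template<typename... Args, typename Tuple>
__global__ void someFunction(void (*fp)(), Tuple params) {
    void (*kfp)(Args...) = (void (*)(Args...)) fp;     // cast back to the real signature
    launchUnpacked(kfp, params, typename make_index_seq<sizeof...(Args)>::type());
}

__global__ void otherFunction(int n) {
    printf("n = %d\n", n);
}
__constant__ void (*d_kfp)(int) = &otherFunction;

int testPassMain(int argc, char **argv) {
    void (*h_kfp)();
    cudaMemcpyFromSymbol(&h_kfp, d_kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
    someFunction<int><<<1, 1>>>(h_kfp, thrust::make_tuple(1));
    cudaDeviceSynchronize();
    return 0;
}

As in t1162.cu above, this would have to be built with relocatable device code, e.g. nvcc -arch=sm_35 -rdc=true ... -lcudadevrt.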

I'm open to nicer solutions!

  • I had assumed you wanted to be able to dispatch kernels with arbitrary parameter sets. This can only dispatch kernels where the parameter set is known (e.g. to be `int` in the example you have shown). I don't see how that fits your problem description, but, whatever. My 2nd suggestion about passing the tuple to the child kernel avoids this limitation, so that the parent kernel need not know anything about the parameter order. – Robert Crovella Jun 22 '16 at 20:01
  • someFunction can be used to dispatch kernels with arbitrary parameter sets: `someFunction<<<1,1>>>(h_kfp, thrust::make_tuple(1.0, 1.5))`. someFunction can use parameter unpacking to support a variable number of parameters as shown here: http://stackoverflow.com/questions/7858817/unpacking-a-tuple-to-call-a-matching-function-pointer – martin Jun 23 '16 at 04:51