How can I pass a function as an argument to a CUDA kernel, so that something like this works?
// Eval(x, y, func): assigns func(x) to y.
// The body is wrapped in do { ... } while (0) so that the macro plus its
// trailing semicolon forms exactly one statement; a bare { ... } block would
// break an unbraced `if (...) Eval(...); else ...`. The assignment target is
// parenthesized so that any lvalue expression can be passed as y.
#define Eval(x, y, func) do { (y) = func(x); } while (0)
/**
 * Writes f(xs[i]) into ys[i] for every i in [0, N).
 *
 * Launch layout: 1D grid of 1D blocks; any grid/block size is valid because
 * each thread walks the index range with a stride equal to the total number
 * of launched threads. No shared memory is used.
 *
 * @param xs input array, at least N elements, dereferenced in device code
 * @param ys output array, at least N elements, written in device code
 * @param N  number of elements to process; nothing is written when N <= 0
 * @param f  function applied to each element. It is called from device code,
 *           so the pointer must refer to a function callable on the device.
 */
__global__ void Evaluate(double *xs, double *ys, int N, double f(double))
{
    // 64-bit index arithmetic keeps blockIdx.x * blockDim.x and the stride
    // increment from overflowing int on large launches.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        Eval(xs[i], ys[i], f);
    }
}
Then, inside the main function, I have:
// Number of sample points; also used as the thread count of the single block
// launched below, so both always agree.
const int kNumPoints = 256;
const size_t kNumBytes = sizeof(double) * kNumPoints;

double *xs_d, *ys_d;  // device buffers (inputs / results)
double *xs_h, *ys_h;  // host buffers (inputs / results)
xs_h = (double *) malloc(kNumBytes);
ys_h = (double *) malloc(kNumBytes);

// Check the allocations too: an unchecked failure here would only surface
// later, at an unrelated call.
HANDLE_ERROR(cudaMalloc((void **)&xs_d, kNumBytes));
HANDLE_ERROR(cudaMalloc((void **)&ys_d, kNumBytes));

// Inputs are the integers 0 .. kNumPoints-1 as doubles.
for (int i = 0; i < kNumPoints; i++)
{
    xs_h[i] = (double)i;
}
HANDLE_ERROR(cudaMemcpy(xs_d, xs_h, kNumBytes, cudaMemcpyHostToDevice));

// NOTE(review): Evaluate calls its last argument from device code, so `Sin`
// must be an address that is valid on the device. If Sin is an ordinary host
// function (TODO confirm), passing it here is the likely cause of the failure;
// the usual remedy is a file-scope __device__ function pointer whose value is
// read back with cudaMemcpyFromSymbol and passed instead.
Evaluate<<<1, kNumPoints>>>(xs_d, ys_d, kNumPoints, Sin);

// A kernel launch returns no status itself: launch-configuration errors are
// reported by cudaGetLastError(), and faults raised while the kernel runs are
// reported by the next synchronizing call. Checking both here pins a failure
// to the kernel instead of to the cudaMemcpy that follows.
HANDLE_ERROR(cudaGetLastError());
HANDLE_ERROR(cudaDeviceSynchronize());
HANDLE_ERROR(cudaMemcpy(ys_h, ys_d, kNumBytes, cudaMemcpyDeviceToHost));
It fails on the last line. So far I have seen solutions such as "How to pass device function as an input argument to host-side function?", but those use __device__
functions, which cannot be changed or accessed by a host function (main,
for example). For instance, I cannot put __device__ int *fptrf1 = (int *)f1;
(taken from that example) inside main. Is it possible to somehow have this flexibility?