Yes, you can do that with thrust.
You can pass raw device pointers to thrust, and thrust will do the right thing if you explicitly specify the device execution path using a thrust execution policy (e.g. thrust::device).
Alternatively, you can use thrust::device_ptr to refer to your data, and thrust will also do the right thing, even without explicitly specifying the device execution path.
This answer covers both approaches, albeit with inclusive_scan rather than reduce.
Here is an example:
$ cat t137.cu
#include <thrust/reduce.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <iostream>
// Writes each element's own index into it: d[i] = i for every i in [0, n).
// Uses only the x components of the thread/block indices; threads whose
// global index is at or beyond n return without touching memory, so the
// launch may contain more threads than elements.
__global__ void k(int *d, int n){
  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  if (gid >= n) {
    return;
  }
  d[gid] = gid;
}
// Number of int elements in the device array used by the example.
constexpr int ds = 10;
// Threads per block used for the kernel launch.
constexpr int nTPB = 256;
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what){
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Allocates ds ints on the device, fills them with 0..ds-1 via kernel k, then
// sums them twice with thrust::reduce: once through a thrust::device_ptr
// wrapper and once through the raw pointer plus the thrust::device execution
// policy. Prints both sums. Returns 0 on success, 1 if a CUDA call failed.
int main(){
  int *d = nullptr;
  if (!cudaOk(cudaMalloc(&d, ds*sizeof(d[0])), "cudaMalloc")) return 1;

  k<<<(ds+nTPB-1)/nTPB,nTPB>>>(d, ds);
  // The launch statement itself yields no status, so query the runtime for
  // any error it recorded (e.g. an invalid launch configuration).
  if (!cudaOk(cudaGetLastError(), "kernel launch")) {
    cudaFree(d);
    return 1;
  }

  // Approach 1: wrap the raw pointer so thrust can tell it is device memory.
  thrust::device_ptr<int> tdp = thrust::device_pointer_cast(d);
  int r1 = thrust::reduce(tdp, tdp+ds);
  // Approach 2: keep the raw pointer and name the device path explicitly.
  int r2 = thrust::reduce(thrust::device, d, d+ds);

  std::cout << "r1: " << r1 << " r2: " << r2 << std::endl;

  // Release the device allocation; the original example never freed it.
  return cudaOk(cudaFree(d), "cudaFree") ? 0 : 1;
}
$ nvcc -std=c++14 -o t137 t137.cu
$ ./t137
r1: 45 r2: 45
$