I'm running a parallelized binary-search routine from Thrust (`thrust::lower_bound`) on an array:
// `array` and `array2` are raw pointers to device memory; wrap `array`
// in a device_ptr so it can be handed to Thrust as an iterator.
thrust::device_ptr<int> array_ptr(array);
// Locate the first position in [array_ptr, array_ptr + length) at which 0
// could be inserted without violating the ordering defined by cmp.
thrust::device_vector<int>::iterator iter =
    thrust::lower_bound(array_ptr, array_ptr + length, 0, cmp(array2));
A custom function object, `cmp`, defines the comparison operator:
/**
 * Comparison functor that orders two ints by the value
 * device_function(array2, v) computes for each of them, rather than by
 * the ints themselves.
 *
 * Binary-search algorithms such as lower_bound require the comparison to
 * be a strict weak ordering, so comparing a value with itself must
 * yield false.
 */
struct cmp
{
    /// Stores the pointer that is forwarded to device_function on every
    /// comparison.
    cmp(int *array2) : array2(array2) {}

    /// Returns true when x's computed quantity is strictly less than y's.
    /// Marked const because the call does not modify the functor, so it
    /// can also be invoked through a const copy.
    __device__ bool operator()(const int& x, const int& y) const
    {
        // Strict '<' (not '<='): with '<=' the predicate is true for
        // equivalent elements, which breaks the strict-weak-ordering
        // contract and makes lower_bound skip past the first matching
        // position.
        return device_function(array2, x) < device_function(array2, y);
    }

    int *array2;  ///< Passed unchanged as device_function's first argument.
};
The comparison relies on a call to a function compiled on the device:
/**
 * Sums every element of array2 that is strictly greater than value.
 *
 * This is a plain sequential loop: the function itself does not divide
 * the work between threads, so each call walks all `length` elements.
 *
 * @param array2 array to scan; must hold at least `length` readable ints
 * @param value  threshold; only elements strictly greater than it are summed
 * @param length number of elements to scan (defaults to 50000, the count
 *               that was previously hard-coded, so existing two-argument
 *               calls behave as before)
 * @return the sum of the qualifying elements, or 0 if there are none
 */
__device__ int device_function( const int* array2, const int value, const int length = 50000 ){
    int quantity = 0;
    for (int i = 0; i < length; ++i){
        // Read the element once and reuse it for both the test and the sum.
        const int element = array2[i];
        if ( element > value ){ quantity += element; }
    }
    return quantity;
}
My question is: what (if any) parallel execution is done on the device for the sum-reduction in `device_function`? If the function executes serially, how can I introduce parallelism to speed up its evaluation?