I currently have to run a nested loop as follow:
for(int i = 0; i < N; i++){
for(int j = i+1; j <= N; j++){
compute(...)//some calculation here
}
}
I've tried leaving the first loop in CPU
and do the second loop in GPU
. Results are too many memory access
. Is there any other ways to do it? For example by thrust::reduce_by_key
?
The whole program is here:
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <cmath>
#include <iostream>
#include <iomanip>
#define N 1000000
// define a 2d point pair
typedef thrust::tuple<float, float> Point;
// return a random Point in [0,1)^2
Point make_point(void)
{
static thrust::default_random_engine rng(12345);
static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
float x = dist(rng);
float y = dist(rng);
return Point(x,y);
}
struct sqrt_dis: public thrust::unary_function<Point, double>
{
float x, y;
double tmp;
sqrt_dis(float _x, float _y): x(_x), y(_y){}
__host__ __device__
float operator()(Point a)
{
tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\
(thrust::get<1>(a)-y)*(thrust::get<1>(a)-y);
tmp = -1.0*(sqrt(tmp));
return (1.0/tmp);
}
};
int main(void) {
clock_t t1, t2;
double result;
t1 = clock();
// allocate some random points in the unit square on the host
thrust::host_vector<Point> h_points(N);
thrust::generate(h_points.begin(), h_points.end(), make_point);
// transfer to device
thrust::device_vector<Point> points = h_points;
thrust::plus<double> binary_op;
float init = 0;
for(int i = 0; i < N; i++){
Point tmp_i = points[i];
float x = thrust::get<0>(tmp_i);
float y = thrust::get<1>(tmp_i);
result += thrust::transform_reduce(points.begin()+i,\
points.end(),sqrt_dis(x,y),\
init,binary_op);
std::cout<<"result"<<i<<": "<<result<<std::endl;
}
t2 = clock()-t1;
std::cout<<"result: ";
std::cout.precision(10);
std::cout<< result <<std::endl;
std::cout<<"run time: "<<t2/CLOCKS_PER_SEC<<"s"<<std::endl;
return 0;
}