#include <iostream>
#include <math.h>
#include <vector>
#include <assert.h>
#include <fstream>
#include <map>
#include <algorithm>
#include <sstream>
#include <cuda_runtime_api.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <cub/cub.cuh>
using namespace std;
typedef float real;
int MAX_N = 10000000;
int N;
real* a, *b;
real* d_a;
real* h_res1, *h_res2;
volatile real v_res = 0;
class MyTimer {
std::chrono::time_point<std::chrono::system_clock> start;
public:
void startCounter() {
start = std::chrono::system_clock::now();
}
int64_t getCounterNs() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count();
}
int64_t getCounterMs() {
return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - start).count();
}
double getCounterMsPrecise() {
return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::system_clock::now() - start).count()
/ 1000000.0;
}
};
void genData()
{
N = 100000;
for (int i = 0; i < N; i++) a[i] = float(rand() % 1000) / (rand() % 1000 + 1);
}
void __attribute__((noinline)) testCpu(real* arr, real* res, int N)
{
std::sort(arr, arr + N);
v_res = arr[rand() % N];
memcpy(res, arr, N * sizeof(real));
}
__global__
void sort_kernel(float* a, int N)
{
if (blockIdx.x==0 && threadIdx.x==0)
thrust::sort(thrust::device, a, a + N);
__syncthreads();
}
void __attribute__((noinline)) testGpu(real* arr, real* res, int N)
{
MyTimer timer;
timer.startCounter();
cudaMemcpy(d_a, arr, N * sizeof(float), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
cout << "Copy H2D cost = " << timer.getCounterMsPrecise() << "\n";
timer.startCounter();
//thrust::sort(thrust::device, d_a, d_a + N);
sort_kernel<<<1,1>>>(d_a, N);
cudaDeviceSynchronize();
cout << "Thrust sort cost = " << timer.getCounterMsPrecise() << "\n";
timer.startCounter();
cudaMemcpy(res, d_a, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cout << "Copy D2H cost = " << timer.getCounterMsPrecise() << "\n";
v_res = res[rand() % N];
}
void __attribute__((noinline)) deepCopy(real* a, real* b, int N)
{
for (int i = 0; i < N; i++) b[i] = a[i];
}
void testOne(int t, bool record = true)
{
MyTimer timer;
genData();
deepCopy(a, b, N);
timer.startCounter();
testCpu(a, h_res1, N);
cout << "CPU cost = " << timer.getCounterMsPrecise() << "\n";
timer.startCounter();
testGpu(b, h_res2, N);
cout << "GPU cost = " << timer.getCounterMsPrecise() << "\n";
for (int i = 0; i < N; i++) {
if (h_res1[i] != h_res2[i]) {
cout << "ERROR " << i << " " << h_res1[i] << " " << h_res2[i] << "\n";
exit(1);
}
}
cout << "-----------------\n";
}
int main()
{
a = new real[MAX_N];
b = new real[MAX_N];
cudaMalloc(&d_a, MAX_N * sizeof(float));
cudaMallocHost(&h_res1, MAX_N * sizeof(float));
cudaMallocHost(&h_res2, MAX_N * sizeof(float));
testOne(0, 0);
for (int i = 1; i <= 50; i++) testOne(i);
}
For legacy code reason, I have to perform sort inside a kernel completely. Basically, I need:
__global__ void mainKernel(float** a, int N, float* global_pad)
{
int x;
...
cooperative_groups::grid_group g = cooperative_groups::this_grid();
sortFunc(a[x], N); // this can be a kernel. Then only 1 thread in the grid will call it
g.sync();
...
}
I tried to use thrust::sort
but it's extremely slow. For example, with N = 100000
, the benchmark result is:
CPU cost = 5.82228
Copy H2D cost = 0.088908
Thrust sort from CPU cost = 0.391211 (running line thrust::sort(thrust::device, d_a, d_a + N);)
Thrust sort inside kernel cost = 116 (running line sort_kernel<<<1,1>>>(d_a, N);)
Copy D2H cost = 0.067639
Why is thrust::sort
so slow in this case? I want to find an implementation of sortFunc
that is fastest possible (global_pad
can be used as temporary memory)
Edit: I'm using 2080ti and CUDA 11.4. The compile command I use is
nvcc -o main main.cu -O3 -std=c++17