I was looking to use thrust in my cuda application and I executed the following simple test to see the performance of thrust::sort
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
int main()
{
int min = 1;
int max = 1024*1024;
int n = 1024*1024;
thrust::host_vector<int> h_input(n);
thrust::host_vector<int> h_keysSorted(n);
//fill host input with random data
for(int i=0; i<n; i++){
h_input[i] = min + (rand() % (int)(max - min + 1));
}
thrust::device_vector<int> d_input(n);
float elapsedTime;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start,0);
d_input= h_input;
thrust::sort(d_input.begin(), d_input.end());
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
thrust::copy(d_input.begin(), d_input.end(), h_keysSorted.begin());
std::cout<<"Elapsed time: "<<elapsedTime<<std::endl;
}
Aside from the excessively long compiling and cuda context creation, the above code took just over 200ms to sort 1048576 integers on my gtx 770m. This is horrible. For example the paper indicates timings just under 2 ms for sorting arrays of the same size and I found cpu timings that took less than 200 ms.
I assume that I am doing something obviously wrong but I cant see what it is. Does anyone know why thrust is taking so long? What am I doing wrong?