I'm currently working on a GPU rendering algorithm in which I need to sort an array of this struct:
// Per-element record stored in the array that gets sorted.
// NOTE(review): `vec4` is declared elsewhere; presumably a 4-float vector
// matching the GLSL type of the same name -- confirm.
struct RadiosityData {
vec4 emission;
vec4 radiosity;
float nPixLight;
// Sort key: the operator< defined for this struct compares only nPixCam.
float nPixCam;
// Two unused floats; presumably here so the struct size matches the layout of
// the OpenGL buffer this data comes from -- TODO confirm.
// NOTE(review): identifiers containing a double underscore are reserved in C++.
float __padding[2];
};
I am using the following code to sort the array:
// Wrap the raw pointer in a thrust::device_ptr so it can be used as an
// iterator over RadiosityData elements.
thrust::device_ptr<RadiosityData> dev_ptr = thrust::device_pointer_cast(GPUpointer_ssbo);
// Sort the N elements in place. No comparator is passed, so the ordering comes
// from the operator< overload defined for RadiosityData.
thrust::sort(dev_ptr, dev_ptr + N);
where GPUpointer_ssbo is a GPU pointer coming from cudaOpenGL interop, N is equal to ~300k. The comparison is done with:
// Ordering used when RadiosityData elements are sorted without an explicit
// comparator: lhs is placed before rhs when lhs has the larger nPixCam, so an
// ascending sort yields elements in descending nPixCam order. Only nPixCam is
// inspected; all other fields are ignored.
__host__ __device__ bool operator<(const RadiosityData& lhs, const RadiosityData& rhs)
{
    return rhs.nPixCam < lhs.nPixCam;
}
The sorting is very slow on my GTX 960M: without sorting, my application takes ~10 ms per frame, while with sorting it takes around 35 ms. This means the sorting itself takes ~25 ms. I am measuring the execution time with Nsight Visual Studio Edition.
I am aware that this could be a GPU synchronization problem, since I am doing OpenGL operations prior to calling Thrust. Nevertheless, I am not convinced by this argument, because if I use the unsorted array to display data with OpenGL, it still takes 10 ms in total, which means that there are no sync problems with the OpenGL code itself.
Is this performance expected for such a "small" array? Is there a better GPU sorting algorithm available for this kind of problem?
------------EDIT: I'm compiling in release with the default VS2019 CUDA command, which is:
Driver API (NVCC Compilation Type is .cubin, .gpu, or .ptx) set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\10\" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nvcc.exe" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -o x64\Release\sortBufferCUDA.cu.obj "C:\Users\Jose\Desktop\RealTimeDiffuseIlumination\OpenGL-avanzado\sortBufferCUDA.cu"
Runtime API (NVCC Compilation Type is hybrid object or .c file) set CUDAFE_FLAGS=--sdk_dir "C:\Program Files (x86)\Windows Kits\10\" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin\nvcc.exe" --use-local-env -ccbin "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.26.28801\bin\HostX86\x64" -x cu --keep-dir x64\Release -maxrregcount=0 --machine 64 --compile -cudart static -Xcompiler "/EHsc /nologo /Fd /FS /Zi " -o x64\Release\sortBufferCUDA.cu.obj "C:\Users\Jose\Desktop\RealTimeDiffuseIlumination\OpenGL-avanzado\sortBufferCUDA.cu"
--------------EDIT 2:
The following is a minimal working example:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <thrust/device_vector.h>
// Per-element record sorted in this example: twelve floats in total
// (4 + 4 + 1 + 1 + 2).
struct RadiosityData {
float emission[4];
float radiosity[4];
float nPixLight;
// Sort key: the operator< defined for this struct compares only nPixCam.
float nPixCam;
// Two unused floats; presumably padding so the struct size matches an external
// buffer layout -- TODO confirm.
// NOTE(review): identifiers containing a double underscore are reserved in C++.
float __padding[2];
};
// Forward declaration giving CUDAsort C linkage; the definition further down in
// this file inherits that linkage.
// NOTE(review): presumably so the function can be called from a translation
// unit not compiled by nvcc -- confirm.
extern "C" void CUDAsort();
// Ordering used when RadiosityData elements are sorted without an explicit
// comparator: lhs is placed before rhs when lhs has the larger nPixCam, so an
// ascending sort yields elements in descending nPixCam order. Only nPixCam is
// inspected; all other fields are ignored.
__host__ __device__ bool operator<(const RadiosityData& lhs, const RadiosityData& rhs)
{
    return rhs.nPixCam < lhs.nPixCam;
}
// First-call flag: starts at 1 and is set to 0 by CUDAsort once it has sized
// the buffer below.
int pri = 1;
// Device-side buffer that CUDAsort resizes on its first call and sorts on
// every call.
// NOTE(review): this device_vector has static storage duration, so it is
// constructed before main() and destroyed after main() returns; verify that
// this ordering is safe with respect to CUDA runtime start-up and teardown.
thrust::device_vector<RadiosityData> dev;
// Sorts the global device buffer `dev` in place using the operator< defined
// for RadiosityData. On the first call only (while the global flag `pri` is
// still 1) the buffer is resized to 300000 elements before sorting, so that
// call also pays for the resize.
void CUDAsort()
{
    const bool isFirstCall = (pri == 1);

    if (isFirstCall)
    {
        // Clear the flag before resizing, so the setup is attempted only once.
        pri = 0;
        dev.resize(300000);
    }

    thrust::sort(dev.begin(), dev.end());
}
int main()
{
float time;
cudaEvent_t start, stop;
while (true) {
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
CUDAsort();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Time to generate: %3.1f ms \n", time);
}
return 0;
}