As already suggested in the comments, this kernel may not be the performance limiter you think it is; at least, you've offered no data supporting that idea. However, some suggestions can still be made that should improve the runtime of this kernel.
I'm going to assume GLfloat is equivalent to float. In that case, especially since the primary output of this kernel (device_particleCoordinates) consists of float quantities, it's doubtful that any intermediate calculations done in double precision provide much benefit. You could try converting all operations to float arithmetic.
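As a small illustration of that point (a hypothetical helper, not taken from your code): an unsuffixed floating-point constant is double, so it pulls the surrounding expression into double arithmetic unless the f suffix is used:

// hypothetical device helper: converting a millisecond count to seconds
__device__ float msToSeconds(float ms)
{
    // return ms * 0.001;   // 0.001 is a double constant: ms is promoted and the multiply runs in double
    return ms * 0.001f;     // the f suffix keeps the arithmetic entirely in float
}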
Division in GPU code can be expensive. For floating-point operations, division by a constant can be replaced by multiplication by the reciprocal of that constant.
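In your kernel, for example, the 0.5 * ... / 5000.0 combination folds into a single multiplication by 0.0001f (0.5/5000 = 0.0001), which is where that constant in the modified kernel below comes from. A minimal sketch of the idea (hypothetical helper functions, for illustration only):

// illustration only: the same arithmetic expressed with division vs. reciprocal multiplication
__device__ float scaleWithDivision(float v)   { return 0.5f * v / 5000.0f; } // floating-point division
__device__ float scaleWithReciprocal(float v) { return v * 0.0001f; }        // multiply by 0.5/5000 instead (same result up to rounding)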
Your load and store operations touch alternating locations (a stride-2 pattern, since each thread handles two adjacent elements). The efficiency can be improved with a vectorized load/store. As indicated in the comments in the kernel below, this makes an assumption about the alignment of the underlying data.
Here's an example modified kernel (untested) demonstrating these ideas:
__global__
void updateParticle1(const int num_particles, const double time, const double gravity,
                     GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
                     GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    if (threadId < num_particles)
    {
        // the (int) cast preserves the truncation behavior of the original int particleLifetime
        float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
        // vector (float2) load: assumes the array is at least 8-byte aligned
        float2 dpA = *(reinterpret_cast<float2 *>(device_particleAcceleration)+threadId);
        // 0.0001f == 0.5/5000.0: constants folded, division replaced by multiplication
        float spl2 = 0.0001f * particleLifetime*particleLifetime;
        float distanceX = dpA.x * spl2;
        float distanceY = dpA.y * spl2;
        // vector (float2) load: same alignment assumption
        float2 dpC = *(reinterpret_cast<float2 *>(device_particleStartCoordinates)+threadId);
        dpC.x += distanceX;
        dpC.y += distanceY;
        // vector (float2) store: same alignment assumption
        *(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
    }
}
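The float2 casts above assume each particle's x/y pair is contiguous and that the three arrays are at least 8-byte aligned. Pointers returned directly by cudaMalloc are aligned to at least 256 bytes, so they satisfy this, but a pointer offset into such an allocation might not. If in doubt, a host-side check along these lines could be added (a sketch; isFloat2Aligned is a hypothetical helper, not part of your code):

// minimal host-side sanity check (sketch): confirm the buffers can legally be accessed as float2
#include <cstdint>
#include <cassert>

static bool isFloat2Aligned(const void *p)
{
    // float2 accesses require the address to be a multiple of sizeof(float2) == 8 bytes
    return (reinterpret_cast<uintptr_t>(p) % sizeof(float2)) == 0;
}

// for example, before launching updateParticle1:
// assert(isFloat2Aligned(device_particleAcceleration));
// assert(isFloat2Aligned(device_particleStartCoordinates));
// assert(isFloat2Aligned(device_particleCoordinates));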
According to my testing, these changes reduce the kernel execution time from about 69us (updateParticle) to about 54us (updateParticle1) for ~1 million particles:
$ cat t388.cu
#include <GL/gl.h>
const int ppt = 4;
__global__
void updateParticle(const int num_particles, const double time, const double gravity,
                    GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
                    GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    if (threadId < num_particles)
    {
        int particleLifetime = (time - device_particleCreatedTime[threadId]) / 1000;
        double distanceX = 0.5 * device_particleAcceleration[threadId * 2 + 0] * (particleLifetime * particleLifetime) / 5000.0;
        double distanceY = 0.5 * device_particleAcceleration[threadId * 2 + 1] * (particleLifetime * particleLifetime) / 5000.0;
        device_particleCoordinates[threadId * 2 + 0] = device_particleStartCoordinates[threadId * 2 + 0] + distanceX;
        device_particleCoordinates[threadId * 2 + 1] = device_particleStartCoordinates[threadId * 2 + 1] + distanceY;
    }
}
__global__
void updateParticle1(const int num_particles, const double time, const double gravity,
                     GLfloat* device_particleCoordinates, GLfloat* device_particleStartCoordinates,
                     GLfloat* device_particleAcceleration, GLint* device_particleCreatedTime)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    if (threadId < num_particles)
    {
        float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
        float2 dpA = *(reinterpret_cast<float2 *>(device_particleAcceleration)+threadId);
        float spl2 = 0.0001f * particleLifetime*particleLifetime;
        float distanceX = dpA.x * spl2;
        float distanceY = dpA.y * spl2;
        float2 dpC = *(reinterpret_cast<float2 *>(device_particleStartCoordinates)+threadId);
        dpC.x += distanceX;
        dpC.y += distanceY;
        *(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
    }
}
__global__
void updateParticle2(const int num_particles, const double time, const double gravity,
                     GLfloat * __restrict__ device_particleCoordinates, const GLfloat * __restrict__ device_particleStartCoordinates,
                     const GLfloat * __restrict__ device_particleAcceleration, const GLint * __restrict__ device_particleCreatedTime)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    if (threadId < num_particles)
    {
        float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
        float2 dpA = *(reinterpret_cast<const float2 *>(device_particleAcceleration)+threadId);
        float spl2 = 0.0001f * particleLifetime*particleLifetime;
        float distanceX = dpA.x * spl2;
        float distanceY = dpA.y * spl2;
        float2 dpC = *(reinterpret_cast<const float2 *>(device_particleStartCoordinates)+threadId);
        dpC.x += distanceX;
        dpC.y += distanceY;
        *(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
    }
}
__global__
void updateParticle3(const int num_particles, const double time, const double gravity,
                     GLfloat * __restrict__ device_particleCoordinates, const GLfloat * __restrict__ device_particleStartCoordinates,
                     const GLfloat * __restrict__ device_particleAcceleration, const GLint * __restrict__ device_particleCreatedTime)
{
    int threadId = threadIdx.x + blockIdx.x * blockDim.x;
    for (int i = 0; i < ppt; i++)
    {
        float particleLifetime = (int)((((float)time) - (float)device_particleCreatedTime[threadId]) * (0.001f));
        float2 dpA = *(reinterpret_cast<const float2 *>(device_particleAcceleration)+threadId);
        float spl2 = 0.0001f * particleLifetime*particleLifetime;
        float distanceX = dpA.x * spl2;
        float distanceY = dpA.y * spl2;
        float2 dpC = *(reinterpret_cast<const float2 *>(device_particleStartCoordinates)+threadId);
        dpC.x += distanceX;
        dpC.y += distanceY;
        *(reinterpret_cast<float2 *>(device_particleCoordinates)+threadId) = dpC;
        threadId += gridDim.x*blockDim.x;
    }
}
int main(){
    int num_p = 1048576;
    float *dpC, *dpSC, *dpA;
    int *dpCT;
    cudaMalloc(&dpC, num_p*2*sizeof(dpC[0]));
    cudaMalloc(&dpSC, num_p*2*sizeof(dpSC[0]));
    cudaMalloc(&dpA, num_p*2*sizeof(dpA[0]));
    cudaMalloc(&dpCT, num_p*sizeof(dpCT[0]));
    updateParticle<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle1<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle2<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle3<<<num_p/(ppt*256), 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle1<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle2<<<(num_p+255)/256, 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    updateParticle3<<<num_p/(ppt*256), 256>>>(num_p, 1.0, 1.0, dpC, dpSC, dpA, dpCT);
    cudaDeviceSynchronize();
}
$ nvcc -arch=sm_60 -o t388 t388.cu
$ nvprof ./t388
==32419== NVPROF is profiling process 32419, command: ./t388
==32419== Profiling application: ./t388
==32419== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 30.11% 141.41us 2 70.703us 68.991us 72.416us updateParticle(int, double, double, float*, float*, float*, int*)
23.53% 110.50us 2 55.247us 54.816us 55.679us updateParticle2(int, double, double, float*, float const *, float const *, int const *)
23.31% 109.47us 2 54.735us 54.335us 55.136us updateParticle3(int, double, double, float*, float const *, float const *, int const *)
23.06% 108.29us 2 54.144us 53.952us 54.336us updateParticle1(int, double, double, float*, float*, float*, int*)
API calls: 97.56% 291.86ms 4 72.966ms 273.40us 291.01ms cudaMalloc
1.53% 4.5808ms 384 11.929us 313ns 520.98us cuDeviceGetAttribute
0.49% 1.4735ms 4 368.37us 226.07us 580.91us cuDeviceTotalMem
0.22% 670.21us 4 167.55us 89.800us 369.11us cuDeviceGetName
0.13% 392.94us 1 392.94us 392.94us 392.94us cudaDeviceSynchronize
0.05% 150.44us 8 18.804us 10.502us 67.034us cudaLaunchKernel
0.01% 21.862us 4 5.4650us 4.0570us 7.0660us cuDeviceGetPCIBusId
0.00% 10.010us 8 1.2510us 512ns 2.9310us cuDeviceGet
0.00% 6.6950us 3 2.2310us 435ns 3.8940us cuDeviceGetCount
0.00% 2.3460us 4 586ns 486ns 727ns cuDeviceGetUuid
$
Decorating the pointers with const ... __restrict__ (updateParticle2) doesn't seem to provide any additional benefit for this test case. Calculating 4 particles per thread (updateParticle3) instead of 1 also didn't seem to have a significant impact on the processing time.
Test environment: Tesla P100, CUDA 10.0, CentOS 7.5.