I am a fairly new cuda user. I'm practicing on my first cuda application where I try to accelerate kmeans algorithm by using GPU(GTX 670).
Briefly, each thread works on a single point which is compared to all cluster centers and a point is assigned to a center with minimum distance(kernel code can be seen below with comments).
According to Nsight Visual Studio, I have an occupancy of 99.61%(1024 blocks, 1024 threads per block), 99.34% Streaming Multiprocessor activity, 79.98% warp issue efficiency, no shared memory bank conflicts, 18.4GFLOPs Single MUL and 55.2 GFLOPs Single ADD(takes about 14,5 ms to complete kmeans kernel with given parameters).
According to Wikipedia, GTX670's peak performance is 2460 GFLOPs. I am nowhere close to it. In addition to these, some papers claim they can achieve more than half of the peak performance. I cannot see how further I can optimize this kernel code. Is there any optimization that I can apply to the kernel? Any suggestion or help is appreciated and I can give any additional information on demand.
Thanks in advance.
#define SIZE 1024*1024 //number of points
#define CENTERS 32 //number of cluster centroids
#define DIM 8 //dimension of each point and center
#define cudaTHREADSIZE 1024 //threads per block
#define cudaBLOCKSIZE SIZE/cudaTHREADSIZE //number of blocks for kernel
__global__ void kMeans(float *dp, float *dc,int *tag, int *membershipChangedPerBlock)
{
//TOTAL NUMBER OF THREADS SHOULD BE EQUAL TO THE NUMBER OF POINTS, BECAUSE EACH THREAD WORKS ON A SINGLE POINT
__shared__ unsigned char membershipChanged[cudaTHREADSIZE];
__shared__ float dc_shared[CENTERS*DIM];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int threadID = threadIdx.x;
membershipChanged[threadIdx.x] = 0;
//move centers to shared memory, because each and every thread will call it(roughly + %10 performance here)
while(threadID < CENTERS*DIM){
dc_shared[threadID] = dc[threadID];
threadID += blockDim.x;
}
__syncthreads();
while(tid < SIZE){
int index,prevIndex;
float dist, min_dist;
index = 0;//all initial point indices(centroid number) are assigned to 0.
prevIndex = 0;
dist = 0;
min_dist = 0;
//euclid distance for center 0
for(int dimIdx = 0; dimIdx < DIM; dimIdx++){
min_dist += (dp[tid + dimIdx*SIZE] - dc_shared[dimIdx*CENTERS])*(dp[tid + dimIdx*SIZE] - dc_shared[dimIdx*CENTERS]);
}
//euclid distance for other centers with distance comparison
for(int centerIdx = 1; centerIdx < CENTERS; centerIdx++){
dist = 0;
for(int dimIdx = 0; dimIdx < DIM; dimIdx++){
dist += (dp[tid + dimIdx*SIZE] - dc_shared[centerIdx + dimIdx*CENTERS])*(dp[tid + dimIdx*SIZE] - dc_shared[centerIdx + dimIdx*CENTERS]);
}
//compare distances, if found a shorter one, change index to that centroid number
if(dist < min_dist){
min_dist = dist;
index = centerIdx;
}
}
if (tag[tid] != index) {//if a point's cluster membership changes, flag it as changed in order to compute total membership changes later on
membershipChanged[threadIdx.x] = 1;
}
tag[tid] = index;
__syncthreads();//sync before applying sum reduction to membership changes
//sum reduction
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (threadIdx.x < s) {
membershipChanged[threadIdx.x] +=
membershipChanged[threadIdx.x + s];
}
__syncthreads();
}
if (threadIdx.x == 0) {
membershipChangedPerBlock[blockIdx.x] = membershipChanged[0];
}
tid += blockDim.x * gridDim.x;
}
}