I'm implementing k-means on the GPU, and for now I have the following code:
__device__ unsigned int cuda_delta = 0;

__global__ void kmeans_kernel(const sequence_t *data,
                              const sequence_t *centroids,
                              int *membership,
                              unsigned int n,
                              unsigned int numClusters)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n) {
        int min_distance = INT_MAX;
        int nearest = -1;
        // find the closest centroid to this data point
        for (int i = 0; i < numClusters; i++) {
            sequence_t centroid = centroids[i];
            int dist = distance(centroid, data[index]);
            if (dist < min_distance) {
                nearest = i;
                min_distance = dist;
            }
        }
        // count how many points changed cluster in this iteration
        if (membership[index] != nearest) {
            membership[index] = nearest;
            atomicInc(&cuda_delta, n);
        }
    }
}
As you can see, there is no data dependency in the algorithm itself; the only shared state is the counter cuda_delta, which lives in global memory. According to the documentation:

An atomic function performs a read-modify-write atomic operation on one 32-bit or 64-bit word residing in global or shared memory

That is exactly what I need.
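To make sure I understand the semantics, here is a minimal standalone sketch (not my real code; the kernel name, the flag array and the sizes are made up purely for illustration) of how I expect atomicInc to behave: it returns the old value and stores ((old >= bound) ? 0 : old + 1), so with the bound set to n and at most n increments per iteration the counter should never wrap and the final value should be deterministic.

// Minimal sketch, just to illustrate the atomicInc semantics I rely on.
// The names (count_changed, changed, counter) are made up for this example.
#include <cstdio>

__device__ unsigned int counter = 0;

__global__ void count_changed(const int *changed, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && changed[i])
        atomicInc(&counter, n);   // serialized read-modify-write on global memory
}

int main()
{
    const unsigned int n = 8;
    int h_changed[n] = {1, 0, 1, 1, 0, 0, 1, 0};   // 4 flags set
    int *d_changed = NULL;
    cudaMalloc(&d_changed, n * sizeof(int));
    cudaMemcpy(d_changed, h_changed, n * sizeof(int), cudaMemcpyHostToDevice);

    unsigned int zero = 0, result = 0;
    cudaMemcpyToSymbol(counter, &zero, sizeof(unsigned int));
    count_changed<<<1, n>>>(d_changed, n);
    cudaMemcpyFromSymbol(&result, counter, sizeof(unsigned int));
    printf("counter = %u\n", result);              // prints 4 on every run
    cudaFree(d_changed);
    return 0;
}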
Edit: here is all of my host code:
unsigned int delta = 0; // number of objects that changed membership in the current iteration
label = (int*)calloc(data_size,sizeof(int));
centroids = (sequence_t*)calloc(clusters,sizeof(sequence_t));
// cuda variables
sequence_t * cuda_data = NULL;
sequence_t * cuda_centroids = NULL;
int *cuda_membership = NULL;
unsigned int *cuda_tmp_centroidCount = NULL;
const unsigned int threadsPerBlock = 1024;
const unsigned int numBlocks = (data_size + threadsPerBlock - 1) / threadsPerBlock;
const unsigned int numBlocks2 = (clusters + threadsPerBlock - 1) / threadsPerBlock;
for (unsigned int i = 0; i < clusters; i++) {
    int h = i * data_size / clusters;
    centroids[i] = make_ulong3(data[h].x, data[h].y, data[h].z);
}
memset (label,-1,data_size * sizeof(int));
checkCuda(cudaMalloc(&cuda_data, data_size * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_centroids, clusters * sizeof(sequence_t)));
checkCuda(cudaMalloc(&cuda_membership, data_size * sizeof(int)));
checkCuda(cudaMalloc(&cuda_tmp_centroidCount, clusters * BIT_SIZE_OF(sequence_t) *sizeof(unsigned int)));
checkCuda(cudaMemcpy(cuda_data,data, data_size *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_centroids, centroids, clusters *sizeof(sequence_t) , cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(cuda_membership, label, clusters *sizeof(int) , cudaMemcpyHostToDevice));
int pc = 0;
do {
    cudaMemset(cuda_tmp_centroidCount, 0, clusters * BIT_SIZE_OF(sequence_t) * sizeof(unsigned int));
    delta = 0;
    checkCuda(cudaMemcpyToSymbol(cuda_delta, &delta, sizeof(unsigned int), 0, cudaMemcpyHostToDevice));
    kmeans_kernel <<< numBlocks, threadsPerBlock >>> (cuda_data,
                                                      cuda_centroids,
                                                      cuda_membership,
                                                      data_size,
                                                      clusters);
    cudaDeviceSynchronize();
    checkCuda(cudaMemcpyFromSymbol(&delta, cuda_delta, sizeof(unsigned int)));
    printf("%d - delta = %d\n", pc, delta);
    checkCuda(cudaGetLastError());
    pc++;
} while (delta > 0);
// copy output
checkCuda(cudaMemcpy(label,cuda_membership, clusters *sizeof(int) , cudaMemcpyDeviceToHost));
checkCuda(cudaMemcpy(centroids,cuda_centroids, clusters *sizeof(sequence_t) , cudaMemcpyDeviceToHost));
// free cuda memory
checkCuda(cudaFree(cuda_data));
checkCuda(cudaFree(cuda_centroids));
checkCuda(cudaFree(cuda_membership));
checkCuda(cudaFree(cuda_tmp_centroidCount));
checkCuda(cudaDeviceReset());
The delta value printed in the first iteration changes when I run the code multiple times, and it shouldn't. Most of the time the values printed are:
0 - delta = 18630
0 - delta = 859
The expected value is 18634. Am I missing something here?
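For what it's worth, here is a small check I could drop in just before the first kernel launch (hypothetical, not part of the repository) to confirm whether the device-side membership array really starts out as all -1; it only uses the variables already declared above:

// Hypothetical sanity check, not in the repository: read cuda_membership
// back before the first launch and count entries that are not -1 on the device.
int *host_check = (int*)malloc(data_size * sizeof(int));
checkCuda(cudaMemcpy(host_check, cuda_membership, data_size * sizeof(int),
                     cudaMemcpyDeviceToHost));
unsigned int not_initialized = 0;
for (unsigned int i = 0; i < data_size; i++)
    if (host_check[i] != -1)
        not_initialized++;
printf("entries not set to -1 on the device: %u\n", not_initialized);
free(host_check);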
Edit: The full code is available on GitHub. To run the example, just compile using make and run the program with the arguments below. Do it multiple times and you will see that the delta value for the first iteration is not always the expected one.
./cuda-means mus_musmusculus.dat 859
Thanks in advance!