I have N iterations and L memory locations. Each iteration writes to one of the L memory locations (indices 0 to L-1).
I want to store data in global memory based on iteration number. Suppose iterations K and K-1 both write to the same location. The final result in global memory should then be the one written by iteration K. Thread K-1 should not overwrite the result of thread K.
I have tried the solution below. I keep an array that holds, for each memory location, the highest iteration number that has written to it so far, initialized to -1. Each thread then checks whether its own iteration number is greater than the stored value. If it is, it stores the data and updates the stored iteration number.
This is my current code, but it does not give correct results for a larger number of iterations.
#include<stdio.h>
#include"cuda.h"
// Kernel: "highest iteration number wins" update of a small set of locations.
//
// Each thread is one iteration; its iteration number is its flat global
// thread index. The thread looks up its target location in index1[] and,
// while holding that location's spin lock, writes A[location] only if no
// thread with a higher iteration number has already written there.
//
// Parameters:
//   A        - data array, one entry per location; the winning thread stores 89.
//   maxIndex - per-location highest iteration number seen so far; the caller
//              initializes it to a value below every iteration number (-1).
//   index1   - target location for each thread, indexed by global thread id.
//   lock     - per-location lock words; 0 = free, 1 = held; caller zeroes them.
//
// Preconditions: 1D launch, and index1 holds at least gridDim.x * blockDim.x
// entries (there is no bounds check because the element count is not passed in).
__global__ void fun(int *A,int *maxIndex,int *index1,int *lock)
{
    const int iteration_no = blockIdx.x * blockDim.x + threadIdx.x;
    const int index = index1[iteration_no];

    // The protected data is modified by other threads between our lock
    // attempts, so it is accessed through volatile pointers: every read and
    // write becomes a real memory access instead of a reused register value
    // or a stale cached copy.
    volatile int *sharedA = A;
    volatile int *sharedMax = maxIndex;

    // Acquire, work and release all happen inside the same branch so that a
    // thread holding the lock never waits on a sibling thread of its own warp
    // that is still spinning on it.
    bool done = false;
    while (!done)
    {
        if (atomicCAS(&lock[index], 0, 1) == 0)
        {
            // Order the lock acquisition before the reads of the guarded data.
            __threadfence();

            if (sharedMax[index] < iteration_no)
            {
                sharedA[index] = 89;
                sharedMax[index] = iteration_no;
            }

            // Make the updates visible device-wide before the lock is seen as free.
            __threadfence();
            atomicExch(&lock[index], 0);
            done = true;
        }
    }
}
// Evaluates a CUDA runtime call; on failure prints the error with its source
// line to stderr and makes the enclosing function return 1.
#define CHECK_CUDA(call)                                                  \
    do {                                                                  \
        cudaError_t status_ = (call);                                     \
        if (status_ != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error at line %d: %s\n", __LINE__,      \
                    cudaGetErrorString(status_));                         \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Host driver: builds a pseudo-random iteration -> location mapping, runs the
// kernel with one thread per iteration, then prints the mapping followed by
// the final value and winning iteration number of every location.
// Returns 0 on success, 1 if any CUDA call fails.
int main()
{
    const int numLocations = 10;
    const int numBlocks = 16;
    const int threadsPerBlock = 512;
    // One thread per iteration; the kernel has no bounds check, so the
    // mapping array must have exactly as many entries as launched threads.
    const int numIterations = numBlocks * threadsPerBlock;

    int A[numLocations] = {10,20,30,40,50,60,70,80,90,100};
    int maxIndex[numLocations];
    int lock[numLocations];
    for (int i = 0; i < numLocations; i++)
    {
        maxIndex[i] = -1; // below every iteration number, so the first writer always wins
        lock[i] = 0;      // all locks start free
    }

    int index[numIterations];
    srand(0); // fixed seed: same mapping on every run
    for (int ii = 0; ii < numIterations; ii++)
    {
        index[ii] = rand() % numLocations;
    }

    int *index1 = 0;
    int *A1 = 0, *maxIndex1 = 0;
    int *lock1 = 0;
    CHECK_CUDA(cudaMalloc((void**)&lock1, sizeof(int) * numLocations));
    CHECK_CUDA(cudaMalloc((void**)&A1, sizeof(int) * numLocations));
    CHECK_CUDA(cudaMalloc((void**)&index1, sizeof(int) * numIterations));
    CHECK_CUDA(cudaMalloc((void**)&maxIndex1, sizeof(int) * numLocations));

    CHECK_CUDA(cudaMemcpy(A1, A, sizeof(int) * numLocations, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(lock1, lock, sizeof(int) * numLocations, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(maxIndex1, maxIndex, sizeof(int) * numLocations, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(index1, index, sizeof(int) * numIterations, cudaMemcpyHostToDevice));

    fun<<<numBlocks, threadsPerBlock>>>(A1, maxIndex1, index1, lock1);
    CHECK_CUDA(cudaGetLastError());      // launch-configuration errors
    CHECK_CUDA(cudaDeviceSynchronize()); // errors raised while the kernel ran

    CHECK_CUDA(cudaMemcpy(A, A1, sizeof(int) * numLocations, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(maxIndex, maxIndex1, sizeof(int) * numLocations, cudaMemcpyDeviceToHost));

    printf("\nindex \n");
    for (int i = 0; i < numIterations; i++)
    {
        printf("%d\n", index[i]);
    }
    for (int i = 0; i < numLocations; i++)
    {
        printf(" %d max is %d\n", A[i], maxIndex[i]);
    }

    CHECK_CUDA(cudaFree(lock1));
    CHECK_CUDA(cudaFree(A1));
    CHECK_CUDA(cudaFree(index1));
    CHECK_CUDA(cudaFree(maxIndex1));
    return 0;
}