I am trying to implement KNN (k nearest neighbours) to each coordinate of x, y using CUDA, it works if I run the project on up to 7000 coordinates, after that it just prints junk, is there a way to make sure CUDA finishes its calculations before calling the memcpy
function?
main:
runCuda(cudaAllCordDevice, cudaKNearestCord, numOfJobsPerSlave, knn, size, max, maxThreadNumber, numOfJobsForCuda);
if (cudaDeviceSynchronize() == cudaSuccess){
printf("cuda success");
}else
{
printf("cuda fail");
}
cudaFinishedCalculation = (Coordinate*)malloc((knn+1)*sizeof(Coordinate)*numOfJobsForCuda);
cudaMemcpy(cudaFinishedCalculation, cudaKNearestCord, numOfJobsForCuda*((knn+1)*sizeof(Coordinate)), cudaMemcpyDeviceToHost);
and my kernel (cu):
__global__ void calcNCoordinates(Coordinate* cudaAllCoordArr, Coordinate* cudaKNearest,int startIndex, int knn, int size, Coordinate max, int cudaSizeToCalc)
{
int i, j ,index;
Coordinate* cudaTempKNearest = (Coordinate*)malloc((knn+1)*sizeof(Coordinate));
int threadId = threadIdx.x; //get current thread Id
index = threadId+startIndex;
while(threadId < cudaSizeToCalc) //while treadId is smaller then the size to calc for cuda
{
calcKnnPerCoodinate(cudaAllCoordArr, cudaTempKNearest, knn, size, index, max);
for(j = 0 ; j < knn+1 ; j++)
{
cudaKNearest[j+threadId*(knn+1)] = cudaTempKNearest[j];
}
threadId += blockDim.x; ////number of threads running in a block - 1024 in afeka pc
index = threadId+startIndex; //update the current index
}
free(cudaTempKNearest);
}
void runCuda(Coordinate* cudaAllCoordArr, Coordinate* cudaKNearest,int startIndex, int knn, int size, Coordinate max, int maxNumberOfThreads, int numOfJobsForCuda)
{
calcNCoordinates<<<1, maxNumberOfThreads>>>(cudaAllCoordArr, cudaKNearest, startIndex, knn, size, max, numOfJobsForCuda);
}
If I try running it with up to 7k coordinates it prints success and otherwise prints fail, I have tried cudaSyncronized()
. I need the program to continue calculating until it finishes.