2

I am trying to implement KNN (k nearest neighbours) to each coordinate of x, y using CUDA, it works if I run the project on up to 7000 coordinates, after that it just prints junk, is there a way to make sure CUDA finishes its calculations before calling the memcpy function?

main:

runCuda(cudaAllCordDevice, cudaKNearestCord, numOfJobsPerSlave, knn, size, max,  maxThreadNumber, numOfJobsForCuda);
        if (cudaDeviceSynchronize() == cudaSuccess){
            printf("cuda success");
        }else
        {
            printf("cuda fail");
        }
        cudaFinishedCalculation = (Coordinate*)malloc((knn+1)*sizeof(Coordinate)*numOfJobsForCuda);
        cudaMemcpy(cudaFinishedCalculation, cudaKNearestCord, numOfJobsForCuda*((knn+1)*sizeof(Coordinate)), cudaMemcpyDeviceToHost);

and my kernel (cu):

__global__ void calcNCoordinates(Coordinate* cudaAllCoordArr, Coordinate* cudaKNearest,int startIndex, int knn, int size, Coordinate max, int cudaSizeToCalc)
{
    int i, j ,index;
    Coordinate* cudaTempKNearest = (Coordinate*)malloc((knn+1)*sizeof(Coordinate));
    int threadId = threadIdx.x; //get current thread Id
    index = threadId+startIndex;

    while(threadId < cudaSizeToCalc) //while treadId is smaller then the size to calc for cuda
    {
        calcKnnPerCoodinate(cudaAllCoordArr, cudaTempKNearest, knn, size, index, max);
        for(j = 0 ; j < knn+1 ; j++)
        {
            cudaKNearest[j+threadId*(knn+1)] = cudaTempKNearest[j];
        }
        threadId += blockDim.x; ////number of threads running in a block - 1024 in afeka pc
            index = threadId+startIndex; //update the current index
    }

    free(cudaTempKNearest);
}

void runCuda(Coordinate* cudaAllCoordArr, Coordinate* cudaKNearest,int startIndex, int knn, int size, Coordinate max, int maxNumberOfThreads, int numOfJobsForCuda)
{
    calcNCoordinates<<<1, maxNumberOfThreads>>>(cudaAllCoordArr, cudaKNearest, startIndex, knn, size, max, numOfJobsForCuda);
}

If I try running it with up to 7k coordinates it prints success and otherwise prints fail, I have tried cudaSyncronized(). I need the program to continue calculating until it finishes.

mustaccio
  • 18,234
  • 16
  • 48
  • 57
Coder123
  • 784
  • 2
  • 8
  • 29
  • 3
    `cudaMemcpy` will wait until all preceding activity has finished. The problem is not with `cudaMemcpy`, the problem is you have a bug in your code. You should use proper cuda error checking and run your code with `cuda-memcheck`. What happens if you run your code with `cuda-memcheck` ? – Robert Crovella Mar 21 '16 at 11:55
  • How do i use cuda-memcheck? – Coder123 Mar 21 '16 at 11:56
  • Try to add error checking after your cuda operations: https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api – Ander Biguri Mar 21 '16 at 11:58
  • 1
    it gives me : "cudaDeviceSynchronize returned error code 30 after launching addKernel!", im not sure what it is but google said to turn off the tdr, how can i do it? – Coder123 Mar 21 '16 at 16:24
  • 2
    did you try searching on "cuda tdr"? The very first hit I get when I do that is [this one](http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm). In addition, the subject is discussed in many questions here on the cuda tag. And regarding your previous question about `cuda-memcheck`, if I search on "cuda-memcheck", the very first hit I get is [this one](https://developer.nvidia.com/cuda-memcheck). – Robert Crovella Mar 21 '16 at 17:28
  • i sloved this by disabling the tdr, thanks! – Coder123 Mar 26 '16 at 18:15

0 Answers0