Cuda cudaMemcpyDeviceToHost is giving error

Question

I am writing cuda version of merge sort and if I am using cudaMemcpyDeviceToHost in order to get back list of elements from GPU, it's giving memory error, on the other side if I am commenting out the line then the program is not sorting properly. Can anyone please suggest.

/* C program for Merge Sort with Cuda Technology*/
 #include<stdlib.h>
 #include<stdio.h>
 #include <cuda.h>
 #include <sys/time.h>

 #define THR1 1000
 #define THR2 10000

 #define N 800000

 /*
 ********************************
  Program UTILITY Code here
 ********************************
 */

 static void HandleError( cudaError_t err, const char *file, int line ) {
 if (err != cudaSuccess) {
 printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
 exit( EXIT_FAILURE );
 }}

 #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

 /* Function to print an array */
 void printArray(int A[], int size)
 {
 int i;
 for (i=0; i < size; i++)
    printf("%d ", A[i]);
 printf("\n");
}


 //Function to test if the output is in asending order or not
 void test(int a[], int n) {
  int i;
 for (i=1;i<n;++i) {
 if (a[i]<a[i-1]) {
  break;
 }
 }
if (i<n) { 
 for (i=1;i<n;++i) {  
   if (a[i]>a[i-1]){
      break;
   }
}
if (i<n) {
  printf("\nArray is not sorted\n");
 }
 }
 else {
 printf("\nArray is sorted\n");
 }
 }
/*
*****************************************
 Sequential Version here
*****************************************
*/

 void insertionSort(int array[], int min, int max)
{
int key ;
// we loop through all elements in the original array from the min + 1 element
for (int j = min + 1 ; j <= max ; j++)
{
    // store the current element as the key
    key = array[j] ;
    // get the element just before the current element
    int i = j - 1 ;
    // loop through all elements from the key to the min element
    // check if the current element is smaller than the key
    while (i >= min && array[i] > key)
    {
        // we move the current element backward
        array[i+1] = array[i] ;
        i-- ;
    }
    // we finally move the key
    array[i+1] = key ;
}
}



 void merge(int array[], int min, int max, int mid)
{
int firstIndex = min;
int secondIndex = mid + 1;
int * tempArray = new int [max + 1];


// While there are elements in the left or right runs
for (int index = min; index <= max; index++) {
    // If left run head exists and is <= existing right run head.
    if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex]))
    {
        tempArray[index] = array[firstIndex];
        firstIndex = firstIndex + 1;
    }

    else
    {
        tempArray[index] = array[secondIndex];
        secondIndex = secondIndex + 1;
    }

} 

// transfer to the initial array
for (int index = min ; index <= max ; index++)
    array[index] = tempArray[index];
  }



 void smergeSort(int array[], int min, int max, int threshold)
{
// prerequisite
if ( (max - min + 1) <= threshold )
{
    insertionSort(array, min, max);
}
else
{
    // get the middle point
    int mid = (max+min) / 2;

    // apply merge sort to both parts of this
    smergeSort(array, min, mid, threshold);
    smergeSort(array, mid+1, max, threshold);

    // and finally merge all that sorted stuff
    merge(array, min, max, mid) ;
}
}

 /*
 *****************************************
 Parallel Version here
 *****************************************
 */
 __device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++) {
    if (i < middle && (j >= end || source[i] < source[j])) {
        dest[k] = source[i];
        i++;
    } else {
        dest[k] = source[j];
        j++;
    }
}
}

__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {

int idx = blockDim .x * blockIdx .x + threadIdx .x;

int start = width*idx*slices, 
     middle, 
     end;

for (int slice = 0; slice < slices; slice++) {
    if (start >= size)
        break;

    middle = min(start + (width >> 1), size);
    end = min(start + width, size);
    gpu_bottomUpMerge(source, dest, start, middle, end);
    start += width;
}
}

void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {

// Allocate two arrays on the GPU we switch back and forth between them during the sort

int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;

// Actually allocate the two arrays

HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));

// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));  

int* A = D_data;
int* B = D_swp;

int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
    int slices = size / ((nThreads) * width) + 1;

    // Actually call the kernel
    gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
    cudaDeviceSynchronize();

    // Switch the input / output arrays instead of copying them around
    A = A == D_data ? D_swp : D_data;
    B = B == D_data ? D_swp : D_data;
}

// Get the list back from the GPU 
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));

// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));

}




 /* Driver program to test above functions */
 int main()
{


dim3 threadsPerBlock;
dim3 blocksPerGrid;

threadsPerBlock.x = 224;
blocksPerGrid.x = 10; 

int i, *a;

 printf("How many elements in the array? ");

 a = (int *)malloc(sizeof(int) * N);        
 srand(time(0));
 for(i=0;i<N;i++)
       {
         a[i]=rand()%1000;
       }    
 printf("List Before Sorting...\n");
// printArray(a, N);

if (N<=THR2)
{
 clock_t begin = clock();
 smergeSort(a, 0, N - 1, THR2);
 clock_t end = clock(); 
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("SM");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d",N);

 exit(0);
}      

else 
{
 clock_t begin = clock();
 mergesort(a, N, threadsPerBlock, blocksPerGrid);
 clock_t end = clock();
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("Cuda");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d\n",N);

 exit(0);
}      
}

Now program is working fine even for large elements, however, when I am using a large number of threads, let's say block 10 and threads 224 it;'s giving error :- an illegal memory access was encountered in mergesort.cu at line 215

After debugging the code I am getting below errors again:-

========= Invalid __global__ read of size 4
=========     at 0x00000148 in 
/home/sharmpra/mergesort.cu:150:gpu_mergesort(int*, int*, int, int, int, dim3*, dim3*)
=========     by thread (96,0,0) in block (9,0,0)
=========     Address 0x915fc0000 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204235]
=========     Host Frame:./a.out [0x1e831]
=========     Host Frame:./a.out [0x3c3d3]
=========     Host Frame:./a.out [0x38a8]
=========     Host Frame:./a.out [0x37b1]
=========     Host Frame:./a.out [0x3810]
=========     Host Frame:./a.out [0x33d1]
=========     Host Frame:./a.out [0x35ae]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
=========     Host Frame:./a.out [0x2bc9]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib64/libcuda.so.1 [0x2ef503]
=========     Host Frame:./a.out [0x3c0f6]
=========     Host Frame:./a.out [0x33da]
=========     Host Frame:./a.out [0x35ae]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf0) [0x20790]
=========     Host Frame:./a.out [0x2bc9]
=========

May please someone suggest what additional things I can implement in the program to remove such errors. Also, I am using this command line settings:-nvcc -o a.out -Wno-deprecated-gpu-targets -lineinfo -arch=compute_20,sm_20 -rdc=true -lcudadevrt mergesort.cu

There is a problem in your `gpu_mergesort` kernel code. Since you've not shown that, or even indeed not shown the exact error from `cudaMemcpy`, no one can help you beyond that. When you are asking for help in **debugging** some code (please read [here](http://stackoverflow.com/help/on-topic)<--click and start reading) you are **expected** to provide a [mcve]. What you have shown is not a [mcve]. It should be a complete code, that someone else could compile and run and see the error, without having to add anything or change anything. — Robert Crovella, May 17 '17 at 14:13
A fairly powerful method is available to you to start debugging such problems on your own. Read [here](http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218). If you follow that method, you can isolate the problem to a single line of kernel code that is causing the "memory error". — Robert Crovella, May 17 '17 at 14:16
@RobertCrovella thanks a lot for the suggestion regarding the debug tool, you are right problem is not in this function, I am uploading the rest of the code, meanwhile, I am also trying to figuring out the problem, it's showing error in bottomupmerge function. — Pragya Sharma, May 17 '17 at 14:57
I guess you're really not understanding what a complete code is, based on your most recent addition to the code. I'm not able to compile or run the code you have shown. I should not have to add anything to it. At a minimum you should provide a `main` routine which sets up all your data and properly calls your `mergesort` routine. If you want others to help you, make it as easy as possible for them to do so. Think about what a complete code means. — Robert Crovella, May 17 '17 at 15:02
@RobertCrovella I really apologize for that, I have updated the entire code. — Pragya Sharma, May 17 '17 at 15:15
OK you seem to have grasped what Complete means. Now let's work on what "Minimal" means. Remove anything that is not related to the problem at hand. For example, we don't need to see the timing code, right? And a bunch of host-side sorting code probably isn't necessary, right? Re-write your program to just call the mergesort routine with appropriate data, while eliminating the things that don't pertain to the problem you are having. What you end up with still needs to be a complete code that someone else can compile, and run, and see the error. — Robert Crovella, May 17 '17 at 15:25
Okay, thanks, I will follow your suggestion and hopefully, it should work. :) — Pragya Sharma, May 17 '17 at 15:27
@RobertCrovella also I would like to mention that when I used your suggested debugging tool I am getting errors related to gpu_mergesort function as you suspected, I have also added the errors in the code section. — Pragya Sharma, May 17 '17 at 15:37
the `cuda-memcheck` output you have included shows that you are not compiling with `-lineinfo`, which is necessary for `cuda-memcheck` to tell you the exact line that the out-of-bounds `__global__` read occurred at. Perhaps you should re-read [this](http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218). — Robert Crovella, May 17 '17 at 15:40
Thanks, now it's showing exact line number which is line 134:- if (i < middle && (j >= end || source[i] < source[j])) { line in gpu_bottomUpMerge function. — Pragya Sharma, May 17 '17 at 15:55
So the next step in your debugging sequence is to ask yourself "which operations in that line of code could be resulting in a invalid global read of size 4? " That answer to that is that the code must read both `source[i]` and `source[j]` from global memory, each of which are `int` quantities (size 4 bytes). The only way these could be a problem is if the `i` or `j` indices are out of range for the size of the `source` array. So you may want to insert a line of code that checks to see if `i` or `j` is too big (or negative), and I'm willing to bet that check will certainly be hit. — Robert Crovella, May 17 '17 at 18:01
You'll then need to work backward in your code to find out why your code is setting values of `i` or `j` that are out of range for the `source` array. — Robert Crovella, May 17 '17 at 18:02
@RobertCrovella thanks for the explanation, I am trying to find out the reason, however, kind of stuck. — Pragya Sharma, May 17 '17 at 18:40
@RobertCrovella thanks to your guidance I was able to remove the errors although with higher number of threads its showing same error again. — Pragya Sharma, May 19 '17 at 10:45
Seems like you figured it out based on your cross-posting [here](https://devtalk.nvidia.com/default/topic/1009645/cuda-programming-and-performance/program-is-giving-an-illegal-memory-access-was-encountered-error-with-higher-number-of-threads/). — Robert Crovella, May 20 '17 at 18:28
@RobertCrovella yes thanks a lot for all your guidance, I just made 2 changes in code,1.) I copied all source elements to destination in the beginning and then made the comparison between source[i] < dest [j]. and 2.) instead of int I used long. :) — Pragya Sharma, May 23 '17 at 23:36
You can answer your own question if you want. If you have your code working now, and can articulate the important changes, it may be useful for future readers. If you post an answer that includes a working code and a description of how you got there, I would probably upvote that answer. — Robert Crovella, May 23 '17 at 23:38
@RobertCrovella thanks for the suggestion, sure I will do that. :) — Pragya Sharma, May 24 '17 at 07:18
@RobertCrovella I have answered the and also I have added the code. :) — Pragya Sharma, May 25 '17 at 14:59

score 1 · Accepted Answer · answered May 25 '17 at 14:58

As explained by @Robert that the code was reading both source[i] and source[j] from global memory, each of which is int quantities (size 4 bytes), so I tried to avoid using the same array for comparison and I added :- for (int k = start; k < end; k++) dest[k] = source[k]; in gpu_bottomUpMerge, by adding this line in my code it was working for more blocks and threads but still giving illegal memory error for the large numbers of elements, so, to resolve that issue I used pointers and instead of int, I used long. Below is the updated version of the program :

/* C program for Merge Sort with Cuda Technology*/
#include<stdlib.h>
#include<stdio.h>
#include <cuda.h>
#include <sys/time.h>

#define THR1 1000
#define THR2 10000

#define N 800000

/*
********************************
Program UTILITY Code here
********************************
*/

static void HandleError( cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
exit( EXIT_FAILURE );
}}

#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

/* Function to print an array */
void printArray(int A[], int size)
{
int i;
for (i=0; i < size; i++)
    printf("%d ", A[i]);
printf("\n");
}


//Function to test if the output is in ascending order or not
void test(int a[], int n) {
int i;
for (i=1;i<n;++i) {
if (a[i]<a[i-1]) {
  break;
}
}
if (i<n) { 
for (i=1;i<n;++i) {  
   if (a[i]>a[i-1]){
      break;
   }
 }
if (i<n) {
  printf("\nArray is not sorted\n");
}
}
else {
printf("\nArray is sorted\n");
}
}
/*
*****************************************
Sequential Version here
*****************************************
*/

void insertionSort(int array[], int min, int max)
{
int key ;
// we loop through all elements in the original array from the min + 1 element
for (int j = min + 1 ; j <= max ; j++)
{
    // store the current element as the key
    key = array[j] ;
    // get the element just before the current element
    int i = j - 1 ;
    // loop through all elements from the key to the min element
    // check if the current element is smaller than the key
    while (i >= min && array[i] > key)
    {
        // we move the current element backward
        array[i+1] = array[i] ;
        i-- ;
    }
    // we finally move the key
    array[i+1] = key ;
 }
 }



void merge(int array[], int min, int max, int mid)
{
int firstIndex = min;
int secondIndex = mid + 1;
int * tempArray = new int [max + 1];


// While there are elements in the left or right runs
for (int index = min; index <= max; index++) {
    // If left run head exists and is <= existing right run head.
    if (firstIndex <= mid && (secondIndex > max || array[firstIndex] <= array[secondIndex]))
    {
        tempArray[index] = array[firstIndex];
        firstIndex = firstIndex + 1;
    }

    else
    {
        tempArray[index] = array[secondIndex];
        secondIndex = secondIndex + 1;
    }

} 

// transfer to the initial array
for (int index = min ; index <= max ; index++)
    array[index] = tempArray[index];
}



void smergeSort(int array[], int min, int max, int threshold)
{
// prerequisite
if ( (max - min + 1) <= threshold )
{
    insertionSort(array, min, max);
}
else
{
    // get the middle point
    int mid = (max+min) / 2;

    // apply merge sort to both parts of this
    smergeSort(array, min, mid, threshold);
    smergeSort(array, mid+1, max, threshold);

    // and finally merge all that sorted stuff
    merge(array, min, max, mid) ;
 }
 }

/*
*****************************************
 Parallel Version here
*****************************************
*/
 __device__ void gpu_bottomUpMerge(int* source, int* dest, int start, int middle, int end) {
int i = start;
int j = middle;
for (int k = start; k < end; k++)
dest[k] = source[k];
for (int k = start; k < end; k++) {
    if (i < middle && (j >= end || source[i] < dest[j])) {
        dest[k] = source[i];
        i++;
    } else {
        dest[k] = source[j];
        j++;
    }
 }
}

__global__ void gpu_mergesort(int* source, int* dest, int size, int width, int slices, dim3* threads, dim3* blocks) {

int idx = blockDim .x * blockIdx .x + threadIdx .x;

int start = width*idx*slices, 
     middle, 
     end;

for (int slice = 0; slice < slices; slice++) {
    if (start >= size)
        break;

    middle = min(start + (width >> 1), size);
    end = min(start + width, size);
    gpu_bottomUpMerge(source, dest, start, middle, end);
    start += width;
}
}

void mergesort(int* data, int size, dim3 threadsPerBlock, dim3 blocksPerGrid) {

// Allocate two arrays on the GPU we switch back and forth between them during the sort

int* D_data;
int* D_swp;
dim3* D_threads;
dim3* D_blocks;

// Actually allocate the two arrays

HANDLE_ERROR(cudaMalloc((void**) &D_data, size * sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**) &D_swp, size * sizeof(int)));

// Copy from our input list into the first array
HANDLE_ERROR(cudaMemcpy(D_data, data, size * sizeof(int), cudaMemcpyHostToDevice));  

int* A = D_data;
int* B = D_swp;

int nThreads = threadsPerBlock.x * threadsPerBlock.y * threadsPerBlock.z * blocksPerGrid.x * blocksPerGrid.y * blocksPerGrid.z;

// Divide the list and give pieces of it to each thread, letting the pieces grow bigger and bigger until the whole list is sorted
for (int width = 2; width < (size << 1); width <<= 1) {
    int slices = size / ((nThreads) * width) + 1;

    // Actually call the kernel
    gpu_mergesort<<<blocksPerGrid, threadsPerBlock>>>(A, B, size, width, slices, D_threads, D_blocks);
    cudaDeviceSynchronize();

    // Switch the input / output arrays instead of copying them around
    A = A == D_data ? D_swp : D_data;
    B = B == D_data ? D_swp : D_data;
}

// Get the list back from the GPU 
HANDLE_ERROR(cudaMemcpy(data, A, size * sizeof(int), cudaMemcpyDeviceToHost));

// Free the GPU memory
HANDLE_ERROR(cudaFree(A));
HANDLE_ERROR(cudaFree(B));

}




/* Driver program to test above functions */
int main()
{


dim3 threadsPerBlock;
dim3 blocksPerGrid;

threadsPerBlock.x = 122;
blocksPerGrid.x = 1; 

int i, *a;

 printf("How many elements in the array? ");

 a = (int *)malloc(sizeof(int) * N);        
 srand(time(0));
 for(i=0;i<N;i++)
       {
         a[i]=rand()%1000;
       }    
 printf("List Before Sorting...\n");
// printArray(a, N);

if (N<=THR2)
{
 clock_t begin = clock();
 smergeSort(a, 0, N - 1, THR2);
 clock_t end = clock(); 
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("SM");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d",N);

 exit(0);
}      

else 
{
 clock_t begin = clock();
 mergesort(a, N, threadsPerBlock, blocksPerGrid);
 clock_t end = clock();
 printf("\nSorted array:  ");
 //printArray(a,N);
 printf("\n");
 test(a,N);
 double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
 printf("Cuda");
 printf("Elapsed: %f seconds\n",time_spent );
 printf("\nSize of the array is %d\n",N);

 exit(0);
}      
}

Cuda cudaMemcpyDeviceToHost is giving error

1 Answers1