I am trying to write a program that compares the performance of the GPU against the CPU when executing merge sort, using CUDA.
The CPU function works fine.
But the GPU function fails as soon as I input a size larger than 65536.
I am fairly new to CUDA, and I understand that the workaround belongs in the part where I determine how many blocks and threads to use, but I need help with that. Can anyone give me some guidance? Thanks in advance.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <algorithm>
// One pass of bottom-up merge sort: each thread merges two adjacent
// sorted runs of length mergeSize from arr into temp, then copies the
// merged range back into arr.
__global__ void mergeSortGPU(int* arr, int* temp, int size, int mergeSize) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int start = tid * mergeSize * 2;
    int mid = start + mergeSize;
    int end = start + 2 * mergeSize;

    // Threads past the end of the array, or whose left run is the last
    // (possibly partial) run, have nothing to merge.
    if (start >= size || mid >= size)
        return;
    if (end > size)
        end = size;

    // Standard two-way merge of arr[start..mid) and arr[mid..end) into temp
    int i = start;
    int j = mid;
    int k = start;
    while (i < mid && j < end) {
        if (arr[i] <= arr[j])
            temp[k++] = arr[i++];
        else
            temp[k++] = arr[j++];
    }
    while (i < mid)
        temp[k++] = arr[i++];
    while (j < end)
        temp[k++] = arr[j++];

    // Copy the merged run back so the next pass reads from arr again
    for (int idx = start; idx < end; ++idx)
        arr[idx] = temp[idx];
}
int main() {
    int size;
    std::cout << "Enter the size of the array: ";
    std::cin >> size;

    // Allocate host memory: GPU input, CPU copy, and scratch space
    int* arr = new int[size];
    int* carr = new int[size];
    int* temp = new int[size];

    srand(static_cast<unsigned int>(time(nullptr)));
    for (int i = 0; i < size; ++i) {
        arr[i] = rand() % 100;
        carr[i] = arr[i];
    }

    // GPU variables
    int* gpuArr;
    int* gpuTemp;

    // Launch configuration, computed once for all passes: one thread per
    // pair of elements. This is the part I think needs the workaround.
    int maxThreadsPerBlock = 1024;
    int threadsPerBlock = std::min(maxThreadsPerBlock, size / 2);
    int blocksPerGrid = (size + (2 * threadsPerBlock) - 1) / (2 * threadsPerBlock);
    blocksPerGrid = std::max(blocksPerGrid, 1);

    // Allocate memory on the GPU
    cudaMalloc((void**)&gpuArr, size * sizeof(int));
    cudaMalloc((void**)&gpuTemp, size * sizeof(int));

    // Copy the input array to GPU memory
    cudaMemcpy(gpuArr, arr, size * sizeof(int), cudaMemcpyHostToDevice);

    // Bottom-up merge sort: double the run length each pass
    for (int mergeSize = 1; mergeSize < size; mergeSize *= 2) {
        mergeSortGPU<<<blocksPerGrid, threadsPerBlock>>>(gpuArr, gpuTemp, size, mergeSize);
        cudaDeviceSynchronize();
    }

    // Free allocated memory
    delete[] arr;
    delete[] carr;
    delete[] temp;
    cudaFree(gpuArr);
    cudaFree(gpuTemp);

    return 0;
}
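For debugging, I am thinking of wrapping every CUDA call so failures do not pass silently, since the bare launch above reports nothing. This is a minimal sketch; the errCheck helper is my own, not part of the CUDA API:

static void errCheck(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Inside the merge loop:
mergeSortGPU<<<blocksPerGrid, threadsPerBlock>>>(gpuArr, gpuTemp, size, mergeSize);
errCheck(cudaGetLastError(), "kernel launch");         // invalid configuration is reported here
errCheck(cudaDeviceSynchronize(), "kernel execution"); // faults during the run show up here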
I tried changing blocksPerGrid and threadsPerBlock to their maximum values, but I have a feeling that is not the right way. A sketch of what I am considering instead is below.
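My reading is that the number of independent merges halves each pass, so the launch configuration should probably be recomputed inside the loop rather than once up front. I also suspect that with the fixed configuration, start = tid * mergeSize * 2 in the kernel can overflow int once size grows past 65536, sending negative indices past the bounds check. Here is an untested sketch of what I mean:

for (int mergeSize = 1; mergeSize < size; mergeSize *= 2) {
    // Each thread merges one pair of runs of length mergeSize, so the
    // required thread count shrinks as the runs grow.
    int merges = (size + 2 * mergeSize - 1) / (2 * mergeSize);
    int threadsPerBlock = std::min(1024, merges);
    int blocksPerGrid = (merges + threadsPerBlock - 1) / threadsPerBlock;
    mergeSortGPU<<<blocksPerGrid, threadsPerBlock>>>(gpuArr, gpuTemp, size, mergeSize);
    cudaDeviceSynchronize();
}

Would this be the right direction?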