I am working on the basic CUDA program that only calculates square and cube. But I do not want to write all code in main
thus I have separated into the functions some of them are template. No special purpose to create a template function. Only, I want to try it. The problem is related to if I call the function as naked such as cudaMalloc
it is okay. If I call with my function, it fails. Let me show;
kernel.cuh
#ifndef KERNEL_CUH_
#define KERNEL_CUH_
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <exception>
#include <iostream>
struct GPUVars
{
private:
size_t block_sz;
size_t thread_sz;
public:
GPUVars(size_t block, size_t thread) : block_sz{ block }, thread_sz{ thread } {};
size_t GetBlockSize()const { return block_sz; };
size_t GetThreadSize()const { return thread_sz; }
};
inline bool check_device()
{
auto cuda_device_count{ 0 };
cudaGetDeviceCount(&cuda_device_count);
return cuda_device_count > 0;
}
template <typename T>
void AllocateMem(T* arr, size_t SIZE_BYTE)
{
if (cudaMalloc(&arr, SIZE_BYTE) != cudaSuccess)
{
throw std::bad_alloc();
}
}
template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess)
{
throw std::bad_alloc();
}
}
#endif
main.cpp
#include <iostream>
#include <random>
#include <iomanip>
#include <cassert>
#include "timer.h"
#include "cpu_calc.h"
#include "kernel.cuh"
template <typename T>
void RandNumberGen(T lower, T upper, T* arr, size_t SIZE_ARR)
{
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<> dis(lower, upper);
for (size_t i = 0; i < SIZE_ARR; ++i)
{
arr[i] = dis(gen);
}
}
int main()
{
assert(check_device() == true);
constexpr size_t SIZE_ARR{ 1024 };
double input_arr[SIZE_ARR]{ 0 };
RandNumberGen(1.0, 10000.0, input_arr, SIZE_ARR);
constexpr size_t SIZE_BYTE = SIZE_ARR * sizeof(double);
std::cout << std::setprecision(9) << std::fixed;
double cpu_output[SIZE_ARR]{ 0 };
// SQUARE
auto time = CPUTimer(&cpu_output[0], &input_arr[0], SIZE_ARR, &CPUSquare);
std::cout << "CPU square opeartion with " << SIZE_ARR << " size array takes " << std::setw(18) << time << " ns\n";
GPUVars gpu_vars{ 0, 1024 };
double* pgpu_input = nullptr;
double gpu_output[SIZE_ARR];
double* pgpu_output = nullptr;
AllocateMem(pgpu_input, SIZE_BYTE);
AllocateMem(pgpu_output, SIZE_BYTE);
CopyMemToDevice(input_arr, pgpu_input, SIZE_BYTE);
}
When I call CopyMemToDevice
function, it throws an error due to cudaMemCpy
function return that equal to cudaErrorInvalidValue
.
Also, if I change CopyMemToDevice
function to this still same;
template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
AllocateMem(device_arr, SIZE_BYTE);
if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess) // return 1 which is equal to cudaErrorInvalidValue
{
throw std::bad_alloc();
}
}
When I write this function as below, it works perfectly;
template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
cudaMalloc(&device_arr, SIZE_BYTE);
if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess)
{
throw std::bad_alloc();
}
}
Also, I know that AllocateMem
function works, cudaMalloc
return 0 which is cudaSuccess
.
My question is what is the difference between calling a cudaMalloc
and cudaMemcpy
in the same function and different function? Why it gives cudaErrorInvalidValue : This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values.
error when I call in the separated function? Thanks in advance.
I am using Visual Studio 2019 16.7.1 and CUDA 10.1