0

I am working on the basic CUDA program that only calculates square and cube. But I do not want to write all code in main thus I have separated into the functions some of them are template. No special purpose to create a template function. Only, I want to try it. The problem is related to if I call the function as naked such as cudaMalloc it is okay. If I call with my function, it fails. Let me show;

kernel.cuh

#ifndef KERNEL_CUH_
#define KERNEL_CUH_

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <exception>
#include <iostream>

struct GPUVars
{
private:
    size_t block_sz;
    size_t thread_sz;
public:
    GPUVars(size_t block, size_t thread) : block_sz{ block }, thread_sz{ thread } {};
    size_t GetBlockSize()const { return block_sz; };
    size_t GetThreadSize()const { return thread_sz; }
};


inline bool check_device()
{
    auto cuda_device_count{ 0 };
    cudaGetDeviceCount(&cuda_device_count);
    return cuda_device_count > 0;
}

template <typename T>
void AllocateMem(T* arr, size_t SIZE_BYTE)
{    
    if (cudaMalloc(&arr, SIZE_BYTE) != cudaSuccess)
    {
        throw std::bad_alloc();
    }
}

template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
    if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        throw std::bad_alloc();
    }
}

#endif

main.cpp

#include <iostream>
#include <random>
#include <iomanip>
#include <cassert>
#include "timer.h"
#include "cpu_calc.h"
#include "kernel.cuh"

template <typename T>
void RandNumberGen(T lower, T upper, T* arr, size_t SIZE_ARR)
{
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<> dis(lower, upper);

    for (size_t i = 0; i < SIZE_ARR; ++i)
    {
        arr[i] = dis(gen);
    }
}

int main()
{
    assert(check_device() == true);

    constexpr size_t SIZE_ARR{ 1024 };
    double input_arr[SIZE_ARR]{ 0 };
    RandNumberGen(1.0, 10000.0, input_arr, SIZE_ARR);
    constexpr size_t SIZE_BYTE = SIZE_ARR * sizeof(double);
    std::cout << std::setprecision(9) << std::fixed;
    double cpu_output[SIZE_ARR]{ 0 };

    // SQUARE
    auto time = CPUTimer(&cpu_output[0], &input_arr[0], SIZE_ARR, &CPUSquare);
    std::cout << "CPU square opeartion with " << SIZE_ARR << " size array takes " << std::setw(18) << time << " ns\n";


    GPUVars gpu_vars{ 0, 1024 };
    double* pgpu_input = nullptr;
    double gpu_output[SIZE_ARR];
    double* pgpu_output = nullptr;
    AllocateMem(pgpu_input, SIZE_BYTE); 
    AllocateMem(pgpu_output, SIZE_BYTE);
    CopyMemToDevice(input_arr, pgpu_input, SIZE_BYTE);
}

When I call CopyMemToDevice function, it throws an error due to cudaMemCpy function return that equal to cudaErrorInvalidValue.

Also, if I change CopyMemToDevice function to this still same;

template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
    AllocateMem(device_arr, SIZE_BYTE);
    if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess) // return 1 which is equal to cudaErrorInvalidValue
    {
        throw std::bad_alloc();
    }
}

When I write this function as below, it works perfectly;

template <typename T>
void CopyMemToDevice(const T* host_arr, T* device_arr, size_t SIZE_BYTE)
{
    cudaMalloc(&device_arr, SIZE_BYTE);  
    if (cudaMemcpy(device_arr, host_arr, SIZE_BYTE, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        throw std::bad_alloc();
    }
}

Also, I know that AllocateMem function works, cudaMalloc return 0 which is cudaSuccess.

My question is what is the difference between calling a cudaMalloc and cudaMemcpy in the same function and different function? Why it gives cudaErrorInvalidValue : This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. error when I call in the separated function? Thanks in advance.

I am using Visual Studio 2019 16.7.1 and CUDA 10.1

talonmies
  • 70,661
  • 34
  • 192
  • 269
Murat Hepeyiler
  • 430
  • 3
  • 12

1 Answers1

1

As Igor Tandetnik mentioned in the comment. The problem is only related to pass by value. I updated AllocateMem function as like that;

template <typename T>
void AllocateMem(T** arr, size_t SIZE_BYTE)
{    
    if (cudaMalloc(arr, SIZE_BYTE); != cudaSuccess)
    {
        throw std::bad_alloc();
    }
    
}

And call like this,

AllocateMem(&pgpu_output, SIZE_BYTE);

It works.

Murat Hepeyiler
  • 430
  • 3
  • 12