I tried to write a simple 3D convolution with cuDNN, but after a successful compilation with NVCC I get a vague run-time error: CUDNN_STATUS_BAD_PARAM, reported on the line that calls cudnnGetConvolutionForwardAlgorithm_v7.
I checked the parameters passed to this API and they seemed okay. I do not know how to debug this at the moment, so any suggestion is appreciated. Thanks!
BTW, I am using cuDNN 8.4 and CUDA 11.x, and my operating system is Ubuntu 18.04. (This should not be a version-compatibility issue.)
#include <cudnn.h>
#include <cuda.h>
#include <iostream>
#include <cstdlib>
#include <time.h>
using namespace std;
/*
 * checkCUDNN(expression)
 *
 * Evaluates `expression` exactly once, treating the result as a
 * cudnnStatus_t. If it is not CUDNN_STATUS_SUCCESS, prints the source line
 * number and the cuDNN error string to std::cerr and terminates the process
 * with EXIT_FAILURE.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to a
 * single statement and `if (c) checkCUDNN(x); else ...` parses as intended.
 * The local is given a trailing-underscore name so it cannot shadow a
 * caller variable called `status` that appears inside `expression`.
 */
#define checkCUDNN(expression)                                              \
    do {                                                                    \
        const cudnnStatus_t cudnn_status_ = (expression);                   \
        if (cudnn_status_ != CUDNN_STATUS_SUCCESS) {                        \
            std::cerr << "Error on line " << __LINE__ << ": "               \
                      << cudnnGetErrorString(cudnn_status_) << std::endl;   \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)
int main(int argc, char* argv[]){
int H = atoi(argv[1]);
int W = atoi(argv[2]);
int C = atoi(argv[3]);
int FH = atoi(argv[4]);
int FW = atoi(argv[5]);
int K = atoi(argv[6]);
double* input = new double[K*H*W];
double* kernel = new double[K*C*FH*FW];
for(int k=0; k<K; k++){
for(int c=0; c<C; c++){
for(int i=0; i<FH; i++){
for(int j=0; j<FW; j++){
kernel[k*FH*FW*C+c*FH*FW+i*FW+j] = (c+k) * (i+j);
}
}
}
}
for(int c=0; c<C; c++){
for(int i=0; i<H; i++){
for(int j=0; j<W; j++){
input[c*H*W+i*W+j] = c * (i+j);
}
}
}
double* output = new double[K*H*W];
cudnnHandle_t cudnn;
checkCUDNN(cudnnCreate(&cudnn));
cudnnTensorDescriptor_t input_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&input_descriptor));
int dimA[3] = {C, H, W};
int strideA[3] = {1, 1, 1};
checkCUDNN(cudnnSetTensorNdDescriptor(input_descriptor, CUDNN_DATA_DOUBLE, 3, dimA, strideA));
cudnnFilterDescriptor_t kernel_descriptor;
checkCUDNN(cudnnCreateFilterDescriptor(&kernel_descriptor));
checkCUDNN(cudnnSetFilter4dDescriptor(kernel_descriptor, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, K, C, FH, FW));
cudnnTensorDescriptor_t output_descriptor;
checkCUDNN(cudnnCreateTensorDescriptor(&output_descriptor));
int dimA2[3] = {K, W, H};
int strideA2[3] = {1, 1, 1};
checkCUDNN(cudnnSetTensorNdDescriptor(output_descriptor, CUDNN_DATA_DOUBLE, 3, dimA2, strideA2));
cudnnConvolutionDescriptor_t convolution_descriptor;
checkCUDNN(cudnnCreateConvolutionDescriptor(&convolution_descriptor));
int padding[3] = {0, 1, 1};
int strideA3[3] = {1, 1, 1};
int dilation[3] = {1, 1, 1};
checkCUDNN(cudnnSetConvolutionNdDescriptor(convolution_descriptor, 3, padding, strideA3, dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_DOUBLE));
cudnnConvolutionFwdAlgoPerf_t convolution_algorithm;
int perf_count;
checkCUDNN(cudnnGetConvolutionForwardAlgorithm_v7(cudnn, input_descriptor, kernel_descriptor, convolution_descriptor, output_descriptor, CUDNN_CONVOLUTION_FWD_ALGO_COUNT , &perf_count, &convolution_algorithm));
size_t workspace_bytes = 0;
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnn,
input_descriptor,
kernel_descriptor,
convolution_descriptor,
output_descriptor,
convolution_algorithm.algo,
&workspace_bytes));
double* input_device;
double* kernel_device;
double* output_device;
void* d_workspace{nullptr};
cudaMalloc(&d_workspace, workspace_bytes);
cudaMalloc((void **)& input_device, C*H*W*sizeof(double));
cudaMalloc((void **)& kernel_device, K*C*FH*FW*sizeof(double));
cudaMalloc((void **)& output_device, K*H*W*sizeof(double));
cudaMemcpy(input_device, input, C*H*W*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(kernel_device, kernel, K*C*FH*FW*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(output_device, output, K*H*W*sizeof(double), cudaMemcpyHostToDevice);
const double alpha = 1, beta = 0;
struct timespec time_start = {0, 0};
struct timespec time_end = {0, 0};
clock_gettime(CLOCK_MONOTONIC, &time_start);
cudnnConvolutionForward(cudnn,
&alpha,
input_descriptor,
input_device,
kernel_descriptor,
kernel_device,
convolution_descriptor,
convolution_algorithm.algo,
d_workspace,
workspace_bytes,
&beta,
output_descriptor,
output_device);
clock_gettime(CLOCK_MONOTONIC, &time_end);
double exec_time = (time_end.tv_nsec - time_start.tv_nsec) / 1000000000.0;
cudaMemcpy(output, output_device, K*H*W*sizeof(double), cudaMemcpyDeviceToHost);
double checksum = 0.0;
for(int i=0; i<K*H*W; i++)
checksum += output[i];
cout<<"C3 checksum: "<<checksum<<"\t";
cout<<"C3 execution time: "<<exec_time<<endl;
cudaFree(input_device);
cudaFree(output_device);
cudaFree(kernel_device);
cudaFree(d_workspace);
cudnnDestroyTensorDescriptor(input_descriptor);
cudnnDestroyTensorDescriptor(output_descriptor);
cudnnDestroyFilterDescriptor(kernel_descriptor);
cudnnDestroyConvolutionDescriptor(convolution_descriptor);
cudnnDestroy(cudnn);
free(input);
free(output);
free(kernel);
return 0;
}