In my current project, a call to cudaGetLastError()
is returning unknown error
and I don't know why. The code compiles just fine, but it is not behaving how I would like it to.
Below is a brief, not compilable example of what the relevant code consists of:
CU_Main.cu
Below is the CUDA kernel:
//My CUDA kernel
__global__ void CU_KernelTest(Kernel* matrix){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int idy = blockIdx.y * blockDim.y + threadIdx.y;
if(idx == 0 && idy == 0){
printf("ID is: %d\n", idx);
matrix->set(1,1, 16.0f);
}
}
Here is the host code:
//A host function which is called when a button is clicked
int HOST_OnbuttonClick(){
Kernel* matrix = new Kernel(3,3,2);
Kernel* device_matrix;
cudaMalloc(&device_matrix, sizeof(Kernel));
cudaMemcpy(device_matrix, matrix, sizeof(Kernel), cudaMemcpyHostToDevice);
CU_KernelTest<<<256, 256>>>(device_matrix);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
printf("Error: %s\n", cudaGetErrorString(err));
}
cudaFree(device_matrix);
return 0.0f;
}
When matrix->set(1,1, 16.0f);
is included in the cuda kernel, (err != cudaSuccess)
returns true and prints out UNKNOWN ERROR
, whereas if I comment set
out, i get no error.
The other struct
relevant to this is my own helper for a convolution kernel design I'm going for, naturally called Kernel
.
Kernel.cuh
struct Kernel {
private :
float* kernel;
int rows;
int columns;
public :
__device__ __host__
Kernel(int _rows, int _columns, float _default) {
rows = _rows;
columns = _columns;
kernel = new float[rows * columns];
for(int r = 0; r < rows; r++){
for(int c = 0; c < columns; c++){
kernel[r * rows + c] = _default;
}
}
}
__device__ __host__
void set(int row, int col, float value){
kernel[row * rows + col] = value;
}
}
The goal of this design is to be able to set all values for the kernel on the host, send it to the CUDA kernel, set values there and then retrieve the updated object back at the host.
So, there are two issues really, why would I get an unknown error
message, and is the code syntactically correct that it should work?
Let me know if more information is needed.
Here are the results of the memory checker:
Nsight Debug
================================================================================
CUDA Memory Checker detected 1 threads caused an access violation:
Launch Parameters
CUcontext = 071c7340
CUstream = 08f3e3b8
CUmodule = 08fa97a8
CUfunction = 08fdbbe8
FunctionName = _Z13CU_KernelTestP6Kernel
gridDim = {1,1,1}
blockDim = {256,1,1}
sharedSize = 128
Parameters:
matrix = 0x06b60000 {kernel = 0x07a31718 ???, rows = 3, columns = 3}
Parameters (raw):
0x06b60000
GPU State:
Address Size Type Mem Block Thread blockIdx threadIdx PC Source
-----------------------------------------------------------------------------------------------
07a31728 4 adr st g 0 0 {0,0,0} {0,0,0} 000260 c:\users
Summary of access violations:
c:\users....kernel.cuh(26): error MemoryChecker: #misaligned=0 #invalidAddress=2