I'm debugging some lengthy code which involves some cuda operations.
I'm currently getting the above-mentioned error during a call to cudaMemcpy(...,...,cudaMemcpyHostToDevice),
but I'm not sure it is specifically related to that call.
Here is a code snippet:
// Number of floats in the round-trip test buffer.
int num_elements = 8294400; // --> I also tried it with "1" here which didn't work either!
// Host-side buffer; new float[] leaves its contents uninitialized.
float *checkArray = new float[num_elements];
// Device-side buffer of the same byte size, allocated by the cudaMalloc call below.
float *checkArray_GPU;
CUDA_CHECK(cudaMalloc(&checkArray_GPU, num_elements * sizeof(float)));
// Round trip: host -> device, then device -> host.
// NOTE(review): in-kernel faults from earlier launches surface at the next synchronizing
// call, so an error reported here may originate from a kernel that ran before this snippet.
CUDA_CHECK(cudaMemcpy(checkArray_GPU, checkArray, num_elements * sizeof(float), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(checkArray, checkArray_GPU, num_elements * sizeof(float), cudaMemcpyDeviceToHost));
where CUDA_CHECK is simply a macro for printing any CUDA error (this was part of the existing code and works fine for all other cudaMemcpy or cudaMalloc calls, so it is not part of the problem). Strangely, this code snippet works fine when executed separately in a toy *.cu
example.
So my assumption is that due to previous cuda operations in the program, there have been some errors which have not been reported that cause the bug in the code snippet above. Could that be? Is there a way to check if there is some unreported error involving cuda?
My other guess is that it might be related to the specific graphics card I'm using. I have an Nvidia Titan X Pascal, CUDA 8.0 and cuDNN v5.1. I also tried to compile my code using some special compiler flags like
-arch=sm_30 \
-gencode=arch=compute_20,code=sm_20 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_52,code=compute_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
but it didn't help so far. Here is my current simplified Makefile for completeness:
# Toolchain and CUDA installation paths.
NVCC     = nvcc
CUDA_INC = -I/usr/local/cuda/include
CUDA_LIB = -L/usr/local/cuda/lib64

# Base name shared by the source file, the object file and the shared library.
TARGET = myProgramm
OPTS   = -std=c++11

# Link the object file into the shared library.
$(TARGET).so: $(TARGET).o
	$(NVCC) $(OPTS) -shared $^ $(CUDA_LIB) -o $@

# Compile the CUDA source as position-independent code; rebuilt when the header changes.
$(TARGET).o: $(TARGET).cu headers/some_header.hpp
	$(NVCC) $(OPTS) $(CUDA_INC) -Xcompiler -fPIC -c $<
Has anyone an idea how I could get to the bottom of this?
Edit:
cuda-memcheck was a good idea, so the error apparently happens earlier, during a call to Kernel_set_value:
========= Invalid __global__ write of size 4
========= at 0x00000298 in void Kernel_set_value<float>(unsigned long, unsigned long, float*, float)
========= by thread (480,0,0) in block (30,0,0)
========= Address 0x0005cd00 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x209035]
[...]
========= Host Frame:/media/.../myProgramm.so (_ZN5boost6python6detail6invokeIiPFvRKSsENS0_15arg_from_pythonIS4_EEEEP7_objectNS1_11invoke_tag_ILb1ELb0EEERKT_RT0_RT1_ + 0x2d) [0x3e5eb]
[...]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f4e33]
========= Host Frame:/media/.../myProgramm.so [0x7489f]
F0703 16:23:54.840698 26207 myProgramm.cu:411] Check failed: error == cudaSuccess (4 vs. 0) unspecified launch failure
[...]
========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:python [0x177c2e]
=========
*** Check failure stack trace: ***
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found
but the function Kernel_set_value also works fine in a toy example. Is there anything special to consider when using Kernel_set_value? This is its source code and its respective helper functions:
// Threads per block assumed by CUDA_GET_BLOCKS, CUDA_GET_LOOPS and the index math
// in Kernel_set_value.
#define CUDA_NUM_THREADS 512
// Upper limit on the block count returned by CUDA_GET_BLOCKS.
#define MAX_NUM_BLOCKS 2880

/**
 * Returns the number of blocks for N elements:
 * ceil(N / CUDA_NUM_THREADS), capped at MAX_NUM_BLOCKS.
 *
 * @param N  number of elements to cover
 * @return   block count in [0, MAX_NUM_BLOCKS]; 0 only when N == 0
 */
inline int CUDA_GET_BLOCKS(const size_t N) {
  const size_t threads_per_block = size_t(CUDA_NUM_THREADS);
  // Overflow-safe ceiling division: N + threads_per_block - 1 would wrap for N near SIZE_MAX.
  const size_t blocks_needed =
      N / threads_per_block + (N % threads_per_block != 0 ? 1 : 0);
  // Clamp while still in size_t so a huge count is never truncated by the narrowing to int.
  const size_t max_blocks = size_t(MAX_NUM_BLOCKS);
  return int(blocks_needed < max_blocks ? blocks_needed : max_blocks);
}
/**
 * Returns how many elements each thread must handle so that
 * CUDA_GET_BLOCKS(N) blocks of CUDA_NUM_THREADS threads cover N elements:
 * ceil(N / (blocks * CUDA_NUM_THREADS)).
 *
 * @param N  number of elements to cover
 * @return   per-thread element count; 0 when there is nothing to cover
 */
inline size_t CUDA_GET_LOOPS(const size_t N) {
  // Multiply in size_t so the thread count is never computed in int.
  const size_t total_threads = size_t(CUDA_GET_BLOCKS(N)) * size_t(CUDA_NUM_THREADS);
  // N == 0 yields zero blocks; return early instead of dividing by zero.
  if (total_threads == 0) return 0;
  // Overflow-safe ceiling division (N + total_threads - 1 could wrap for huge N).
  return N / total_threads + (N % total_threads != 0 ? 1 : 0);
}
/**
 * Fills GPUdst[0 .. N) with `value`.
 *
 * Launch layout: 1D grid, 1D blocks. The index math uses the CUDA_NUM_THREADS
 * constant rather than blockDim.x, so the kernel is meant to be launched with
 * exactly CUDA_NUM_THREADS threads per block.
 * NOTE(review): presumably launched as
 * <<<CUDA_GET_BLOCKS(N), CUDA_NUM_THREADS>>> with CUDA_NUM_LOOPS = CUDA_GET_LOOPS(N);
 * the launch site is not visible here - confirm.
 *
 * Each thread writes at most CUDA_NUM_LOOPS elements. Elements are interleaved
 * across the grid (thread t handles t, t + T, t + 2T, ... with
 * T = gridDim.x * CUDA_NUM_THREADS) so that neighbouring threads touch
 * neighbouring addresses. With CUDA_NUM_THREADS threads per block this writes
 * the same index range, [0, min(N, CUDA_NUM_LOOPS * T)), as giving each thread
 * one contiguous run of CUDA_NUM_LOOPS elements.
 *
 * @param CUDA_NUM_LOOPS  maximum number of elements written by one thread
 * @param N               number of elements in GPUdst
 * @param GPUdst          destination buffer; must be a valid device pointer to
 *                        at least N elements (not checked here)
 * @param value           value stored into every element
 */
template <typename Dtype>
__global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, Dtype* GPUdst, Dtype value){
  // Flat thread index and total thread count, both in size_t to avoid 32-bit overflow.
  const size_t threadId = size_t(CUDA_NUM_THREADS) * size_t(blockIdx.x) + size_t(threadIdx.x);
  const size_t gridThreads = size_t(CUDA_NUM_THREADS) * size_t(gridDim.x);
  size_t idx = threadId;
  // Stop after CUDA_NUM_LOOPS writes or once the index runs past the buffer end.
  for (size_t loop = 0; loop < CUDA_NUM_LOOPS && idx < N; ++loop, idx += gridThreads) {
    GPUdst[idx] = value;
  }
}