Robert Crovella's the authoritative voice here and says it isn't possible for a kernel assert to get information about itself back to the host. So we need some workarounds.
A major complicating factor is that once assert is called on the device we can no longer communicate with it from the host, so any data written to regular device memory becomes unreachable. (Reference).
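As a quick, concrete illustration (a minimal sketch with hypothetical names; exact error codes can vary by platform and driver): a kernel that writes a result and then asserts leaves that result unreachable, because the failed assert poisons every subsequent API call on the context.
//Compile with nvcc -g demo.cu
#include <cassert>
#include <cstdio>
__global__ void kernel_that_asserts(int *out){
  out[0] = 42;    // Work we would like to recover on the host
  assert(false);  // Device-side assert fires after the write
}
int main(){
  int *d_out = nullptr;
  cudaMalloc(&d_out, sizeof(int));
  kernel_that_asserts<<<1, 1>>>(d_out);
  // The assert surfaces here as an error...
  printf("sync:   %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  // ...and the copy-back fails too, so the 42 written above is unreachable.
  int h_out = 0;
  printf("memcpy: %s\n", cudaGetErrorString(
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost)));
  return 0;
}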
Below I offer three workarounds:
- Using Unified Memory to pass info from the GPU to the CPU even "after" an assert is called. This is the best answer.
- Improving the GPU's assertion error messages by passing stacks to the GPU.
- Passing info from the GPU to the CPU by dropping asserts and writing to memory. You'd only use this if UVM wasn't an option for some reason.
Using Unified Memory
Unified Memory allows the CUDA device and the host to transparently shuffle bits between each other without the need for cudaMemcpy. The result is that even though a device-side assert blocks our access to the device via regular API calls, we can still get signals back to the CPU via Unified Memory.
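Here is a minimal sketch of just that mechanism (hypothetical names; no message buffer or stream bookkeeping, but using the same cudaMemAdvise hints as the full code below so the allocation stays resident on the host, with the same platform caveats):
//Compile with nvcc -g demo_um.cu
#include <cassert>
#include <cstdio>
__global__ void kernel_that_asserts(int *managed_flag){
  *managed_flag = 1234;  // Visible to the host even though we assert next
  assert(false);
}
int main(){
  int *managed_flag = nullptr;
  cudaMallocManaged(&managed_flag, sizeof(int));
  // Keep the page on the host and let the GPU write to it directly, so no
  // migration is needed once the context has become unusable
  cudaMemAdvise(managed_flag, sizeof(int), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId);
  cudaMemAdvise(managed_flag, sizeof(int), cudaMemAdviseSetAccessedBy, 0);
  *managed_flag = 0;
  kernel_that_asserts<<<1, 1>>>(managed_flag);
  if(cudaDeviceSynchronize()!=cudaSuccess){
    // No cudaMemcpy needed - and none would succeed at this point anyway
    printf("Device said %d before it died\n", *managed_flag);
  }
  return 0;
}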
Note that if we want kernels to be able to run asynchronously we need a way to associate kernel launches with assertion failures. The circular buffer here provides this functionality.
The full code for this approach is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Standard CUDA success check
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
// Copy string from `src` to `dst`
__device__ void dstrcpy(char *dst, const char *src){
for(;*src!='\0';dst++,src++){
*dst = *src;
}
*dst = '\0';
}
// Used to hold assertion data generated by the device
struct AssertionData {
char assertion_msg[1000];
char filename[1000];
char function_name[1000];
int line_number;
uint32_t caller;
dim3 block_id;
dim3 thread_id;
};
// Used to hold assertions generated by the device
struct AssertionsData {
int assertion_count;
AssertionData assertions[DEVICE_SIDE_ASSERTION_COUNT];
// Get the next place to insert an assertion failure message
__device__ int next_id(){
// Atomically increment so other threads can fail at the same time
return atomicAdd(&assertion_count, 1);
}
__device__ void insert(
const char *assertion_msg0,
const char *filename0,
const char *function_name0,
const int line_number0,
const uint32_t caller0,
const dim3 block_id0,
const dim3 thread_id0
){
const auto nid = next_id();
if(nid>=DEVICE_SIDE_ASSERTION_COUNT){
printf("RAN OUT OF ASSERTION BUFFER SPACE!");
return;
}
auto& self = assertions[nid];
dstrcpy(self.assertion_msg, assertion_msg0);
dstrcpy(self.filename, filename0);
dstrcpy(self.function_name, function_name0);
self.line_number = line_number0;
self.caller = caller0;
self.block_id = block_id0;
self.thread_id = thread_id0;
}
};
// Pointer to device memory allocated to hold assertion failure messages
AssertionsData *uvm_assertions = nullptr;
// Use to hold stack traces generated by the host so that we can run kernels
// asynchronously and still associate stacks to assertion failures
struct StackTraceInfo {
boost::stacktrace::stacktrace stacktrace;
int device;
cudaStream_t stream;
uint32_t generation_number;
StackTraceInfo() = default;
StackTraceInfo(int generation_number0, cudaStream_t stream0) {
// Technically we'd want to lop the top few layers off of this
generation_number = generation_number0;
stacktrace = boost::stacktrace::stacktrace();
CUDA_CHECK_API_CALL(cudaGetDevice(&device));
stream = stream0;
}
};
// Circular buffer used to hold stacks generated by the host
struct CircularTraceBuffer {
// Assume that this is the max number of items that might ever be enqueued
// across all streams
static constexpr int max_size = 1024;
// How many stacktraces we've inserted. Used to ensure that circular queue
// doesn't provide false information by always increasing, but also to mark
// where we are inserting into the queue
uint32_t generation_number = 0;
// The buffer
std::array<StackTraceInfo, max_size> traces;
uint32_t insert(cudaStream_t stream_id) {
traces[generation_number % max_size] = StackTraceInfo(generation_number, stream_id);
return generation_number++;
}
};
// Circular buffer of host stacktraces for associating with kernel launches
CircularTraceBuffer circular_trace_buffer;
// Records an assertion failure in the Unified Memory buffer and then triggers the real
// device-side assert, which stops the kernel and poisons subsequent API calls.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, assertions_data, caller) \
do { \
if (! (condition)) { \
/* Atomically increment so other threads can fail at the same time */ \
assertions_data->insert( \
TOSTRING(condition), \
__FILE__, \
__FUNCTION__, \
__LINE__, \
caller, \
blockIdx, \
threadIdx \
); \
\
assert(condition); \
} \
} while (false)
// NOTE: Our kernels now need a pointer to the assertions data and an id for the caller
// NOTE: We can simplify our code by assuming these variables always have the same names
// so that they do not need to be passed to the preprocessor macro
__global__ void my_failing_kernel(int x, AssertionsData *const assertions_data, const uint32_t caller){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, assertions_data, caller);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
if(cudaDeviceSynchronize()==cudaSuccess){
return;
}
std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
std::cout<< boost::stacktrace::stacktrace()<<std::endl;
for(int i=0;i<uvm_assertions->assertion_count && i<DEVICE_SIDE_ASSERTION_COUNT;i++){
std::cout<<"Assertion failure "<<i<<std::endl;
const auto &self = uvm_assertions->assertions[i];
const auto &stack = circular_trace_buffer.traces[self.caller % CircularTraceBuffer::max_size];
std::cout<<"GPU "<<self.filename<<":"
<<self.line_number<<"("
<<self.function_name<<"): "
<<self.assertion_msg<<std::endl;
if(stack.generation_number == self.caller){
std::cout<<stack.stacktrace
<<"Device = "<<stack.device<<", "
<<"Stream = "<<stack.stream
<<std::endl;
} else {
std::cout<<"CPU stack has been overwritten!"<<std::endl;
}
}
}
int main(){
CUDA_CHECK_API_CALL(cudaMallocManaged(&uvm_assertions, sizeof(AssertionsData)));
CUDA_CHECK_API_CALL(cudaMemAdvise(
uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId
));
// GPU will establish direct mapping of data in CPU memory, no page faults will be generated
CUDA_CHECK_API_CALL(cudaMemAdvise(
uvm_assertions, sizeof(AssertionsData), cudaMemAdviseSetAccessedBy, 0
));
my_failing_kernel<<<1, 1, 0>>>(4, uvm_assertions, circular_trace_buffer.insert(0));
my_failing_kernel<<<1, 1, 0>>>(5, uvm_assertions, circular_trace_buffer.insert(0));
CUDA_CHECK_KERNEL_SUCCESS();
CUDA_CHECK_API_CALL(cudaFree(uvm_assertions));
return 0;
}
The output for the above is:
main_assert_um_from_device.cu:162: void my_failing_kernel(int, AssertionsData *, unsigned int): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
CUDA API call failure detected at (main_assert_um_from_device.cu:167:0):
0# 0x000055D3D8CEAFF2 in ./a.out
1# 0x000055D3D8CEB700 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x000055D3D8CEADAE in ./a.out
Assertion failure 0
GPU main_assert_um_from_device.cu:162(my_failing_kernel): x!=5
0# 0x000055D3D8CECEF9 in ./a.out
1# 0x000055D3D8CED135 in ./a.out
2# 0x000055D3D8CEB6B9 in ./a.out
3# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
4# 0x000055D3D8CEADAE in ./a.out
Device = 0, Stream = 0
Better Assert Messages
This workaround makes the device's assert message better. To do so, we collect stacktrace strings on the host and transfer them to the GPU. Then, when we launch a kernel, we pass it a pointer to the stacktrace string for its launch site. If the kernel fails an assertion condition, it prints the stacktrace before triggering the assertion.
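Stripped of the caching and the nested call sites used in the full listing below, the core of the idea looks roughly like this (a sketch with illustrative names):
//Compile with nvcc -g demo_stack.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/stacktrace.hpp>
#include <cassert>
#include <cstdio>
#include <sstream>
#include <string>
__global__ void kernel_with_context(int x, const char *host_stack){
  if(x==5){
    // Print the host-side launch context before the assert kills the kernel
    printf("Assertion 'x!=5' about to fail; kernel was launched from:\n%s", host_stack);
  }
  assert(x!=5);
}
int main(){
  // Capture the host stacktrace at the launch site as a string
  std::stringstream ss;
  ss << boost::stacktrace::stacktrace();
  const std::string stack_str = ss.str();
  // Copy it (including the terminating '\0') to the device
  char *d_stack = nullptr;
  cudaMalloc(&d_stack, stack_str.size()+1);
  cudaMemcpy(d_stack, stack_str.c_str(), stack_str.size()+1, cudaMemcpyHostToDevice);
  kernel_with_context<<<1, 1>>>(5, d_stack);
  cudaDeviceSynchronize();
  cudaFree(d_stack);
  return 0;
}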
The full code for that is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/stacktrace.hpp>
#include <cassert>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
// Print a beefy kernel assertion message followed by inducing failure using
// the actual assertion
#define CUDA_DEVICE_ASSERT_WITH_STACKTRACE(condition, message) \
do { \
if (! (condition)) { \
printf("Assertion '%s' failed at %s:%d as part of stacktrace:\n%s", \
TOSTRING(condition), \
__FILE__, \
__LINE__, \
message); \
} \
/* Perform actual assertion to stop kernel progress */ \
assert(condition); \
} while (false)
__global__ void my_failing_kernel(int x, const char *d_stacktrace){
CUDA_DEVICE_ASSERT_WITH_STACKTRACE(x!=5, d_stacktrace);
}
// Increases performance by caching stack traces so we don't repeatedly
// transfer the same data to the GPU
std::unordered_map<std::string, char*> cached_stacks;
// Send a stacktrace to the GPU, cache the pointer it's stored at, return
// said pointer
char* setup_device_stacktrace(){
std::stringstream ss;
ss << boost::stacktrace::stacktrace();
const auto cached_stack = cached_stacks.find(ss.str());
if(cached_stack!=cached_stacks.end()){
std::cerr<<"Using cached stacktrace!"<<std::endl;
return cached_stack->second;
}
char *d_stacktrace = nullptr;
cudaMalloc(&d_stacktrace, 10000);
cudaMemcpy(d_stacktrace, ss.str().c_str(), ss.str().size()+1, cudaMemcpyHostToDevice); // +1 copies the terminating '\0'
cached_stacks[ss.str()] = d_stacktrace;
return d_stacktrace;
}
// Make an interesting stack
void nested_n(int depth, int val){
if(depth<5){
nested_n(depth+1, val);
} else {
const char* d_stacktrace = setup_device_stacktrace();
my_failing_kernel<<<1, 1>>>(val, d_stacktrace);
cudaDeviceSynchronize();
}
}
// Make an interesting stack
void nested3(int val){ nested_n(0, val); }
void nested2(int val){ nested3(val); }
void nested1(int val){ nested2(val); }
int main(){
for(int i=4;i<6;i++){
std::cerr<<"Running with value = "<<i<<std::endl;
nested1(i);
}
// Clean-up
for(const auto &x: cached_stacks){
cudaFree(x.second);
}
return 0;
}
This gives the output:
Running with value = 4
Running with value = 5
Using cached stacktrace!
Assertion 'x!=5' failed at main.cu:31 as part of stacktrace:
0# 0x000055BBF4A3CF76 in ./a.out
1# 0x000055BBF4A3D262 in ./a.out
2# 0x000055BBF4A3D258 in ./a.out
3# 0x000055BBF4A3D258 in ./a.out
4# 0x000055BBF4A3D258 in ./a.out
5# 0x000055BBF4A3D258 in ./a.out
6# 0x000055BBF4A3D258 in ./a.out
7# 0x000055BBF4A3D313 in ./a.out
8# 0x000055BBF4A3D32F in ./a.out
9# 0x000055BBF4A3D34B in ./a.out
10# 0x000055BBF4A3D3CF in ./a.out
11# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
12# 0x000055BBF4A3CE0E in ./a.out
main.cu:31: void my_failing_kernel(int, const char *): block: [0,0,0], thread: [0,0,0] Assertion `x!=5` failed.
Replace The Device Assertion With Magic
Here the idea is to replace the device-side assert with our Own Special Assert. Our OSA writes information about itself to device memory, and the host reads that back to see what went wrong. Note that we'd only want to do this if the Unified Memory solution wasn't possible for some reason.
Here, rather than have the kernel fail with an assert, we have any failing threads early-exit the kernel while the rest of the threads continue working. The result is garbage, but at least we can get information about why!
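Here is a minimal sketch of just that early-return pattern (hypothetical names; the full code below packs file, line, and condition strings into the buffer instead of a single flag):
//Compile with nvcc -g demo_flag.cu
#include <cstdio>
__global__ void kernel_that_reports(int x, int *error_flag){
  if(x==5){
    atomicExch(error_flag, __LINE__);  // Record something identifiable...
    return;                            // ...and bail out instead of asserting
  }
  // Rest of the kernel; its output is suspect whenever error_flag is set
}
int main(){
  int *d_error_flag = nullptr;
  cudaMalloc(&d_error_flag, sizeof(int));
  cudaMemset(d_error_flag, 0, sizeof(int));
  kernel_that_reports<<<1, 1>>>(5, d_error_flag);
  // Because no assert fired, the context is intact and this copy succeeds
  int h_error_flag = 0;
  cudaMemcpy(&h_error_flag, d_error_flag, sizeof(int), cudaMemcpyDeviceToHost);
  if(h_error_flag!=0){
    printf("Kernel reported a failure at line %d\n", h_error_flag);
  }
  cudaFree(d_error_flag);
  return 0;
}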
The full code for this is:
//Compile with nvcc -g main.cu -lboost_stacktrace_basic -ldl
#define BOOST_STACKTRACE_USE_ADDR2LINE
#include <boost/assert/source_location.hpp>
#include <boost/stacktrace.hpp>
#include <array>
#include <cassert>
#include <iostream>
// Pointer to device memory allocated to hold assertion failure messages
char *d_assert_buffer = nullptr;
// Number of assertion failure messages we can store. If this is too small
// threads will fail silently.
#define DEVICE_SIDE_ASSERTION_COUNT 10
// Length of each assertion failure message - if this is too small we get
// garbage as threads overwrite each other
#define DEVICE_SIDE_ASSERTION_LENGTH 500
// Total size of the assertion failure message buffer. First 4 bytes stores the
// number of logged messages
#define DEVICE_SIDE_ASSERTION_BUFFER_LEN (4 + DEVICE_SIDE_ASSERTION_COUNT * DEVICE_SIDE_ASSERTION_LENGTH)
// Used by preprocessor to convert things to strings
#define STRINGIFY(x) #x
#define TOSTRING(x) STRINGIFY(x)
#define LINE_STRING TOSTRING(__LINE__)
// Emulates a kernel assertion. The assertion won't stop the kernel's progress, so you
// should assume everything the kernel produces is garbage if there's an assertion failure.
#define CUDA_COMMUNICATING_KERNEL_ASSERTION(condition, buffer) \
do { \
if (! (condition)) { \
/* The first four bytes of the buffer store the number of logged messages */ \
uint32_t *const msgnum_ptr = reinterpret_cast<uint32_t*>(buffer); \
/* Atomically increment so other threads can fail at the same time */ \
const uint32_t msg_num = atomicAdd(msgnum_ptr, 1); \
if(msg_num>=DEVICE_SIDE_ASSERTION_COUNT){ \
printf("RAN OUT OF ASSERTION BUFFER SPACE!\n"); \
return; \
} \
\
/* Find the start of the buffer we'll be writing to */ \
char *const msg_ptr = buffer + 4 + msg_num * DEVICE_SIDE_ASSERTION_LENGTH; \
\
constexpr char const assertion_string[] = TOSTRING(condition); \
constexpr char const line_string[] = LINE_STRING; \
constexpr int assertion_size = sizeof(assertion_string); \
constexpr int filename_size = sizeof(__FILE__)-1; \
\
/* __LINE__ gets turned into a buffer of length 6, it seems, so we need to find */ \
/* the actual length in order to print the message */ \
int line_size = 0; \
for(int i=0;i<20;i++){ \
if(line_string[i]!='\0'){ \
line_size++; \
} else { \
break; \
} \
} \
\
memcpy(msg_ptr, __FILE__, filename_size); \
msg_ptr[filename_size] = ':'; \
memcpy(msg_ptr+filename_size+1, line_string, line_size); \
msg_ptr[filename_size+1+line_size] = ':'; \
memcpy(msg_ptr+filename_size+1+line_size+1, assertion_string, assertion_size); \
msg_ptr[filename_size+1+line_size+1+assertion_size] = '\0'; \
/* If we actually assert then we can't ever get the message to the host, so we */ \
/* return and let the kernel generate garbage */ \
return; \
} \
} while (false)
// Standard CUDA success check
#define CUDA_CHECK_API_CALL(error) \
do { \
const auto error_code = error; \
if(error_code!=cudaSuccess){ \
std::cout<<"CUDA API call failure detected at ("<<__FILE__<<":"<<__LINE__<<"): "<<cudaGetErrorString(error_code)<<std::endl; \
std::cout<< boost::stacktrace::stacktrace() << std::endl; \
}} while(false)
__global__ void my_failing_kernel(int x, char *d_assert_buffer){
CUDA_COMMUNICATING_KERNEL_ASSERTION(x!=5, d_assert_buffer);
}
// Check that kernels ran correctly by acquiring the message buffer. BLOCKING.
void CUDA_CHECK_KERNEL_SUCCESS(const boost::source_location& location = BOOST_CURRENT_LOCATION){
std::array<char, DEVICE_SIDE_ASSERTION_BUFFER_LEN> cuda_assert_buffer = {0};
CUDA_CHECK_API_CALL(cudaDeviceSynchronize());
assert(d_assert_buffer!=nullptr);
// NOTE: We could maybe save time by only moving the message count initially and copying the messages
// conditionally.
CUDA_CHECK_API_CALL(cudaMemcpy(cuda_assert_buffer.data(), d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN, cudaMemcpyDeviceToHost));
CUDA_CHECK_API_CALL(cudaDeviceSynchronize()); // NOTE: Needed for buffers of <64kB
const uint32_t& msg_num = *reinterpret_cast<uint32_t*>(cuda_assert_buffer.data());
if(msg_num==0){
return;
}
std::cout<<"CUDA API call failure detected at ("<<location.file_name()<<":"<<location.line()<<":"<<location.column()<<"): "<<std::endl;
std::cout<< boost::stacktrace::stacktrace();
std::cout<<"Assertion messages ("<<msg_num<<" messages):"<<std::endl;
for(uint32_t i=0;i<msg_num && i<DEVICE_SIDE_ASSERTION_COUNT;i++){
std::cout<<" "<<i<<" "<<cuda_assert_buffer.data()+(4+i*DEVICE_SIDE_ASSERTION_LENGTH)<<std::endl;
}
}
int main(){
CUDA_CHECK_API_CALL(cudaMalloc(&d_assert_buffer, DEVICE_SIDE_ASSERTION_BUFFER_LEN));
my_failing_kernel<<<1, 1>>>(4, d_assert_buffer);
CUDA_CHECK_KERNEL_SUCCESS();
my_failing_kernel<<<1, 1>>>(5, d_assert_buffer);
CUDA_CHECK_KERNEL_SUCCESS();
// Clean-up
cudaFree(d_assert_buffer);
return 0;
}
And the output looks like:
CUDA API call failure detected at (main_assert_from_device.cu:91:0):
0# 0x00005573A1F633A5 in ./a.out
1# 0x00005573A1F637C2 in ./a.out
2# __libc_start_main in /lib/x86_64-linux-gnu/libc.so.6
3# 0x00005573A1F62D9E in ./a.out
Assertion messages (1 messages):
0 main_assert_from_device.cu:86:x!=5