I need to print a few values from a CUDA kernel and have tried using cuPrintf. My GPU has compute capability 1.1, so I cannot use the device-side printf (which requires compute capability 2.0 or higher). The program compiles cleanly and reports no runtime errors, but the cuPrintf calls appear to have no effect at all. Here are the things I have tried:
- Compile with -arch sm_11
- Surround each kernel invocation with cudaPrintfInit and cudaPrintfEnd
- Ensure that the number of characters is small enough to work with the default buffer size
- Ensure that cudaPrintfInit and cudaPrintfDisplay return cudaSuccess
My program uses the following in addition to the regular stuff:
- CUBLAS library
- page-locked (pinned) + mapped memory
Why isn't the call to cuPrintf doing anything?
Edit
Here are some relevant snippets from the code:
/*
 * Exchanges, element by element, the entries addressed by L(r1, c) and
 * L(r2, c) in d_A, where c is this thread's global x index.
 *
 * L(...) is an indexing macro defined elsewhere in the file. Expects a 1-D
 * launch along x; threads whose index is >= n touch no memory.
 */
__global__ void swap_rows(float *d_A, int r1, int r2, int n)
{
    const int col = blockDim.x * blockIdx.x + threadIdx.x;

    // Debug trace: emitted by every launched thread, including those at or
    // beyond n, because it runs before the bounds check.
    cuPrintf("(%d,%d) ", r1, col);

    if (col < n) {
        const float saved = d_A[L(r1, col)];
        d_A[L(r1, col)] = d_A[L(r2, col)];
        d_A[L(r2, col)] = saved;
    }
}
/*
 * Copies the n*n floats of _A into mapped, page-locked host memory, obtains
 * the matching device pointer, and launches swap_rows on it with cuPrintf
 * output collection enabled around the launch.
 *
 * Parameters:
 *   _A  host matrix holding n*n floats (read via cudaMemcpy).
 *   _b  not referenced by the code shown here.
 *   n   matrix dimension; also used to size the launch grid.
 *
 * Returns NULL if setting the device flag, allocating the pinned buffer,
 * fetching its device pointer, or copying _A fails. The pinned buffer is
 * released on the failure paths that follow its allocation. The value
 * returned on success is produced by code elided from this excerpt.
 */
extern "C" float *someFunction(float *_A, float *_b, int n)
{
    int i, i_max, k, n2 = n*n;
    dim3 lblock_size(32,1);
    // One block more than n/32, so every index below n is covered; the
    // kernel's own bounds check discards the surplus threads.
    dim3 lgrid_size(n/lblock_size.x + 1, 1);
    float *d_A, *d_b, *d_x, *h_A, *h_b, *h_x, tmp, dotpdt;
    cublasStatus status;
    cudaError_t ret;

    // Mapped pinned memory must be enabled before it can be allocated.
    if((ret = cudaSetDeviceFlags(cudaDeviceMapHost)) != cudaSuccess) {
        fprintf(stderr, "Error setting device flag: %s\n",
                cudaGetErrorString(ret));
        return NULL;
    }

    // Allocate mem for A and copy data
    if((ret = cudaHostAlloc((void **)&h_A, n2 * sizeof(float),
                            cudaHostAllocMapped)) != cudaSuccess) {
        fprintf(stderr, "Error allocating page-locked h_A: %s\n",
                cudaGetErrorString(ret));
        return NULL;
    }
    if((ret = cudaHostGetDevicePointer((void **)&d_A, h_A, 0)) != cudaSuccess) {
        fprintf(stderr, "Error getting devptr for page-locked h_A: %s\n",
                cudaGetErrorString(ret));
        cudaFreeHost(h_A);  // do not leak the pinned buffer on failure
        return NULL;
    }
    if((ret = cudaMemcpy(h_A, _A, n2 * sizeof(float), cudaMemcpyHostToHost)) !=
       cudaSuccess) {
        fprintf(stderr, "Error copying A into h_A: %s\n", cudaGetErrorString(ret));
        cudaFreeHost(h_A);  // do not leak the pinned buffer on failure
        return NULL;
    }

    // Some code to compute k and i_max

    if(cudaPrintfInit() != cudaSuccess)
        printf("cudaPrintfInit failed\n");

    swap_rows<<<lgrid_size,lblock_size>>>(d_A, k, i_max, n);

    // A kernel launch returns no status itself; launch-time failures (for
    // example a bad configuration) only become visible through
    // cudaGetLastError().
    if((ret = cudaGetLastError()) != cudaSuccess)
        fprintf(stderr, "swap_rows launch failed: %s\n", cudaGetErrorString(ret));

    // Errors raised while the kernel runs surface at the next synchronizing
    // call. The format string now contains %s so the reason is printed.
    if((ret = cudaThreadSynchronize()) != cudaSuccess)
        fprintf(stderr, "Synchronize failed: %s\n", cudaGetErrorString(ret));

    if(cudaPrintfDisplay(stdout, true) != cudaSuccess)
        printf("cudaPrintfDisplay failed\n");
    cudaPrintfEnd();

    // Some more code
}
I forgot to mention: these functions are compiled separately from main() and built into a dynamically linked module (a shared object).