I have found the following hello world program for CUDA:
#include <stdio.h>
/*
 * cudaCheckErrors(msg)
 *
 * Fetches the most recent CUDA runtime error with cudaGetLastError() (which
 * also resets it). If that status is anything other than cudaSuccess, prints
 * `msg`, the runtime's description of the error, and the file/line of the
 * macro invocation to stderr, then terminates the process with exit code 1.
 *
 * Wrapped in do { ... } while (0) so it behaves as a single statement and
 * can be followed by a semicolon anywhere a statement is allowed.
 */
#define cudaCheckErrors(msg)                                               \
    do {                                                                   \
        const cudaError_t cuda_check_status_ = cudaGetLastError();         \
        if (cudaSuccess != cuda_check_status_) {                           \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n",             \
                    msg, cudaGetErrorString(cuda_check_status_),           \
                    __FILE__, __LINE__);                                   \
            fputs("*** FAILED - ABORTING\n", stderr);                      \
            exit(1);                                                       \
        }                                                                  \
    } while (0)
// Element count of the host arrays `a` and `b` declared in main(); also used
// there to compute the byte sizes of the matching device buffers.
const int N = 16;
// Number of threads in the x dimension of the single block that main()
// launches hello() with.
const int blocksize = 16;
/*
 * hello: element-wise in-place add, a[i] += b[i], one element per thread.
 *
 * Launch layout: elements are selected by threadIdx.x only (blockIdx is not
 * consulted), so the kernel is meant to be launched as a single 1-D block.
 *
 * a  in/out buffer of at least N chars, dereferenced on the device.
 * b  input buffer of at least N ints, dereferenced on the device.
 *
 * Threads whose index is >= N do nothing, so launching with more than N
 * threads per block cannot read or write past the end of either buffer.
 */
__global__
void hello(char *a, int *b)
{
    const unsigned int i = threadIdx.x;

    // Bounds guard: the block size is chosen independently of N by the caller.
    if (i < static_cast<unsigned int>(N)) {
        a[i] += b[i];
    }
}
/*
 * Host driver for the hello() kernel.
 *
 * Prints the initial contents of `a` ("Hello "), copies `a` and the per-
 * character offsets in `b` to the device, launches hello() with a single
 * block of `blocksize` threads to add the offsets in place, copies `a` back
 * and prints it followed by a newline.
 *
 * Every CUDA runtime call is followed by cudaCheckErrors(), which aborts the
 * process with exit code 1 on the first failure, so a failing step is never
 * followed by further CUDA work. Returns EXIT_SUCCESS otherwise.
 */
int main()
{
    char a[N] = "Hello \0\0\0\0\0\0";
    int b[N] = {15, 10, 6, 0, -11, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

    // Start from NULL so the pointers never hold indeterminate values if an
    // allocation fails.
    char *ad = NULL;
    int *bd = NULL;

    // Byte sizes taken from the arrays themselves so they cannot drift from
    // the declarations above.
    const size_t csize = sizeof(a);
    const size_t isize = sizeof(b);

    printf("%s", a);

    // Check each allocation on its own: do not attempt the second one after
    // the first has already failed.
    cudaMalloc( (void**)&ad, csize );
    cudaCheckErrors("cudaMalloc fail");
    cudaMalloc( (void**)&bd, isize );
    cudaCheckErrors("cudaMalloc fail");

    cudaMemcpy( ad, a, csize, cudaMemcpyHostToDevice );
    cudaCheckErrors("cudaMemcpy H2D fail");
    cudaMemcpy( bd, b, isize, cudaMemcpyHostToDevice );
    cudaCheckErrors("cudaMemcpy H2D fail");

    dim3 dimBlock( blocksize, 1 );
    dim3 dimGrid( 1, 1 );
    hello<<<dimGrid, dimBlock>>>(ad, bd);
    // First check catches launch/configuration errors reported immediately.
    cudaCheckErrors("Kernel fail");
    // The launch is asynchronous; wait for the kernel so that faults raised
    // while it executes are attributed to it rather than to the copy below.
    cudaDeviceSynchronize();
    cudaCheckErrors("Kernel fail");

    cudaMemcpy( a, ad, csize, cudaMemcpyDeviceToHost );
    cudaCheckErrors("cudaMemcpy D2H/Kernel fail");

    cudaFree( ad );
    cudaCheckErrors("cudaFree fail");
    cudaFree( bd );
    cudaCheckErrors("cudaFree fail");

    printf("%s\n", a);
    return EXIT_SUCCESS;
}
I compile it successfully with `nvcc hello_world.cu -o hello`, but when I run `cuda-memcheck ./hello`, I get:
========= CUDA-MEMCHECK
Fatal error: cudaMalloc fail (unknown error at hello_world.cu:39)
*** FAILED - ABORTING
Hello ========= ERROR SUMMARY: 0 errors
I'm a CUDA newbie. My questions are:
1) What's going on under the hood?
2) How can I fix it?
I'm running Ubuntu 13.04 (x86_64) with CUDA 5.5, and I don't have root access.
The top part of the `nvidia-smi` output is:
+------------------------------------------------------+
| NVIDIA-SMI 337.19 Driver Version: 337.19 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX TIT... Off | 0000:05:00.0 N/A | N/A |
| 26% 37C N/A N/A / N/A | 53MiB / 6143MiB | N/A Default |
+-------------------------------+----------------------+----------------------+
When I run `deviceQuery`, I get:
../../bin/x86_64/linux/release/deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
cudaGetDeviceCount returned 30
-> unknown error
Result = FAIL
And when I run `deviceQueryDrv`, I get:
../../bin/x86_64/linux/release/deviceQueryDrv Starting...
CUDA Device Query (Driver API) statically linked version
cuInit(0) returned 999
-> CUDA_ERROR_UNKNOWN
Result = FAIL
When I run:
#include <cublas_v2.h>
#include <cstdio>
int main()
{
int res;
cublasHandle_t handle;
res = cublasCreate(&handle);
switch(res) {
case CUBLAS_STATUS_SUCCESS:
printf("the initialization succeeded\n");
break;
case CUBLAS_STATUS_NOT_INITIALIZED:
printf("the CUDA Runtime initialization failed\n");
break;
case CUBLAS_STATUS_ALLOC_FAILED:
printf("the resources could not be allocated\n");
break;
}
return 0;
}
The output is `the CUDA Runtime initialization failed`.