nvprof profiles The API just fine. But says No kernels were profiled. It shows these 2 warning messages " ==525867== Warning: 4 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size. ==525867== Warning: 1 records have invalid timestamps due to insufficient semaphore pool size. You can configure the pool size using the option --profiling-semaphore-pool-size. ==525867== Profiling result: No kernels were profiled." I am using NVIDIA GeForce GPU.
#include <stdio.h>
#include <cuda.h>
#include <time.h>
#include <cuda_profiler_api.h>
__global__ void matrixInit(float *m, int N_1, int N_2, int value){
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int strideX = blockDim.x * gridDim.x;
unsigned int strideY = blockDim.y * gridDim.y;
for(int j=iy; j<N_2; j+=strideY){
for(int i=ix; i<N_1; i+=strideX){
m[j*N_1+i] = value;
}
}
}
__global__ void matrixAdd(float *d_A, float *d_B, float *d_C, int N_1, int N_2){
// indexes and strides in 2d
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;
unsigned int strideX = blockDim.x * gridDim.x;
unsigned int strideY = blockDim.y * gridDim.y;
for(int j=iy; j<N_2; j+=strideY){
for(int i=ix; i<N_1; i+=strideX){
d_C[i] = d_A[j*N_1+i]+d_B[j*N_1+i];
}
}
}
int main() {
int N_1 = 1 << 12;
int N_2 = 1 << 15;
//Size
int N_1_2 = N_1 * N_2;
// host memory pointers
float *A, *B, *C;
// device memory pointers
float *d_A, *d_B, *d_C;
clock_t t = clock();
size_t bytes = N_1_2*sizeof(float);
// allocate host memory
A = (float*)malloc(bytes);
B = (float*)malloc(bytes);
C = (float*)malloc(bytes);
//set dimensions for 1d
int threadsPerBlock=32;
dim3 threads(threadsPerBlock,threadsPerBlock);
dim3 numBlocks( N_1/threads.x, N_2/threads.y);
printf(" Grid Size of X: %d Grid Size of Y: %d \n ",threads.x,threads.y);
//Initialize
matrixInit<<<numBlocks,threads>>>(A,N_1, N_2, 1.0f);
matrixInit<<<numBlocks,threads>>>(B,N_1, N_2, 2.0f);
matrixInit<<<numBlocks,threads>>>(C,N_1, N_2, 0.0f);
//allocated device memory
cudaMalloc(&d_A, bytes);
cudaMalloc(&d_B, bytes);
cudaMalloc(&d_C, bytes);
//copy to device
cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);
matrixAdd<<<numBlocks,threads>>>(d_A, d_B, d_C, N_1, N_2);
//copy back to host
cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);
t = clock() - t;
printf("Program executed at %f seconds", ((float)t) / CLOCKS_PER_SEC);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaProfilerStop();
return 0;
}
Using Matrix addition in cuda c,code executes but when profiling it with nvprof.It says NO kernels profiled.