So I just started writing CUDA, following the An Even Easier Introduction to CUDA guide. So far so good. Then I wanted to implement a neural network, which meant quite a few calls to cudaMallocManaged(). To make things more readable, I decided to move these calls into a separate function called allocateStuff() (see the code below). But when I run the program under nvprof, it no longer shows the GPU time of layerInit() and instead gives the following warning:
Warning: 1 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size.
However, when I put the contents of allocateStuff() directly in main() (see the snippet after the full code below), the warning does not appear and the GPU time for layerInit() is shown. So my question is: what am I doing wrong in this function, or else why does it (apparently) overflow the device buffer?
Code:
#include <cuda_profiler_api.h>
#include <iostream>
#include <vector>
__global__
void layerInit(const unsigned int firstNodes,
               const unsigned int secondNodes,
               const unsigned int resultNodes,
               float *firstLayer,
               float *secondLayer,
               float *resultLayer) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (unsigned int i = index; i < firstNodes; i += stride) {
        firstLayer[i] = 0.0f;
    }
    for (unsigned int i = index; i < secondNodes; i += stride) {
        secondLayer[i] = 0.0f;
    }
    for (unsigned int i = index; i < resultNodes; i += stride) {
        resultLayer[i] = 0.0f;
    }
}
void allocateStuff(const unsigned int firstNodes,
                   const unsigned int secondNodes,
                   const unsigned int resultNodes,
                   float *firstLayer,
                   float *secondLayer,
                   float *resultLayer,
                   std::vector<float*> &firstWeightLayer,
                   std::vector<float*> &secondWeightLayer) {
    cudaMallocManaged(&firstLayer, firstNodes * sizeof(float));
    cudaMallocManaged(&secondLayer, secondNodes * sizeof(float));
    cudaMallocManaged(&resultLayer, resultNodes * sizeof(float));
    for (auto& nodeLayer : firstWeightLayer) {
        cudaMallocManaged(&nodeLayer, secondNodes * sizeof(float));
    }
    for (auto& nodeLayer : secondWeightLayer) {
        cudaMallocManaged(&nodeLayer, resultNodes * sizeof(float));
    }
}
template<typename T, typename... Args>
void freeStuff(T *t) {
    cudaFree(t);
}

template<typename T, typename... Args>
void freeStuff(T *t, Args... args) {
    freeStuff(&t);
    freeStuff(args...);
}

void freeStuff(std::vector<float*> &vec) {
    for (auto& v : vec) {
        freeStuff(&v);
    }
}
int main() {
    unsigned int firstNodes = 5, secondNodes = 3, resultNodes = 1;
    float *firstLayer = new float[firstNodes];
    float *secondLayer = new float[secondNodes];
    float *resultLayer = new float[resultNodes];
    std::vector<float*> firstWeightLayer(firstNodes, new float[secondNodes]);
    std::vector<float*> secondWeightLayer(secondNodes, new float[resultNodes]);

    allocateStuff(firstNodes, secondNodes, resultNodes,
                  firstLayer, secondLayer, resultLayer,
                  firstWeightLayer, secondWeightLayer);

    layerInit<<<1,256>>>(firstNodes,
                         secondNodes,
                         resultNodes,
                         firstLayer,
                         secondLayer,
                         resultLayer);
    cudaDeviceSynchronize();

    freeStuff(firstLayer, secondLayer, resultLayer);
    freeStuff(firstWeightLayer);
    freeStuff(secondWeightLayer);
    cudaProfilerStop();
    return 0;
}
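For reference, by "putting the contents of allocateStuff() directly in main()" I mean replacing the allocateStuff(...) call with (roughly) the body of that function, everything else staying the same:

    cudaMallocManaged(&firstLayer, firstNodes * sizeof(float));
    cudaMallocManaged(&secondLayer, secondNodes * sizeof(float));
    cudaMallocManaged(&resultLayer, resultNodes * sizeof(float));
    for (auto& nodeLayer : firstWeightLayer) {
        cudaMallocManaged(&nodeLayer, secondNodes * sizeof(float));
    }
    for (auto& nodeLayer : secondWeightLayer) {
        cudaMallocManaged(&nodeLayer, resultNodes * sizeof(float));
    }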
Output of nvprof ./executable with the function allocateStuff():
==18608== NVPROF is profiling process 18608, command: ./executable
==18608== Profiling application: ./executable
==18608== Warning: 1 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size.
==18608== Profiling result:
No kernels were profiled.
Type Time(%) Time Calls Avg Min Max Name
API calls: 96.20% 105.47ms 11 9.5884ms 5.7630us 105.39ms cudaMallocManaged
...
Output of nvprof ./executable without said function:
==18328== NVPROF is profiling process 18328, command: ./executable
==18328== Profiling application: ./executable
==18328== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 100.00% 2.2080us 1 2.2080us 2.2080us 2.2080us layerInit(unsigned int, unsigned int, unsigned int, float*, float*, float*)
API calls: 99.50% 114.01ms 11 10.365ms 4.9390us 113.95ms cudaMallocManaged
...
Compile command: nvcc -std=c++11 -g -o executable main.cu