I'm trying to recreate one algorithm in cuda and one of steps is blur 3d image (in my case 344 x 288 x 176) and I want to use 3x3x3 Gaussian mask for it.
I use cuda 8 in visual studio community 2015 on machine with intel i7-5500U, GeForce gt840m 4GB and 8GB ram, and I'm struggling with this problem for few days now.
The issue is that when my program ends blurring function it seems to lost information about memory which is passed by pointer as argument to blurring function, it wont crash on function or cudaPeekAtLastError or cudaDeviceSynchronize, but when I run program from Nsight>Start CUDA debugging it works just fine, and if i place break point before this function and run it, cudaPeekAtLastError and cudaDeviceSynchronize step by step then it sometimes work sometimes it doesnt.
mem-check says:
========= Error: process didn't terminate successfully
========= The application may have hit an error when dereferencing Unified Memory from the host. Please rerun the application under a host debugger to catch such errors.
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaDeviceSynchronize.
I have similar operations in my code and all of them works fine, and i just cant see what is wrong with this. I'm not perfect cuda programmer, I kinda know c and still learning cuda and I'm aware that my English sucks too :)
so there is this function:
__global__ void gaussBlur(float *img,
float *gaussKernel,
int imgSize,
int kernelSize,
int *imgDims,
int *gaussKernelDims) {
float newVoxelValue = 0;
int temp_idx = 0;
int kernel_x, kernel_y, kernel_z, temp_i, indeksWartosciDoPobrania = 0;
int img_z, img_y, img_x;
const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
const unsigned int startIdx = (x * imgDims[1] + y) * imgDims[0];
int start = startIdx;
int stop = startIdx + imgDims[0];
for (int idx = start; idx < stop; idx++) {
if (idx < imgSize) {
img_z = idx / (imgDims[0] * imgDims[1]);
temp_idx = idx - (img_z * imgDims[0] * imgDims[1]);
img_y = temp_idx / imgDims[0];
img_x = temp_idx % imgDims[0];
if (img_x > 1 && img_x < imgDims[0] - 2 &&
img_y > 1 && img_y < imgDims[1] - 2 &&
img_z > 1 && img_z < imgDims[2] - 2) {
newVoxelValue = 0;
for (int i = 0; i < kernelSize; i++) {
kernel_z = i / (gaussKernelDims[0] * gaussKernelDims[1]);
temp_i = i - (kernel_z * gaussKernelDims[0] * gaussKernelDims[1]);
kernel_y = temp_i / gaussKernelDims[0];
kernel_x = temp_i % gaussKernelDims[0];
kernel_x--;kernel_y--;kernel_z--;
indeksWartosciDoPobrania = ((img_z + kernel_z) * imgDims[0] * imgDims[1]) + ((img_y + kernel_y) * imgDims[0]) + (img_x + kernel_x);
newVoxelValue += gaussKernel[i] *
img[indeksWartosciDoPobrania];
}
img[idx] = newVoxelValue;
}
}
}
}
and here is its call:
inline __device__ __host__ unsigned int UMIN(unsigned int a, unsigned int b)
{
return a < b ? a : b;
}
inline __device__ __host__ unsigned int PowTwoDivider(unsigned int n)
{
if (n == 0) return 0;
unsigned int divider = 1;
while ((n & divider) == 0) divider <<= 1;
return divider;
}
int main() {
float *out_image;
float *gaussKernel;
int *gaussKernelDims;
int *imgDims;
cudaMallocManaged((void **)&gaussKernelDims, 3 * sizeof(*gaussKernelDims)); //lets say its hard coded {344, 288, 176}
cudaMallocManaged((void **)&imgDims, 3 * sizeof(*imgDims));//lets say its hard coded {3, 3, 3}
std::ifstream file("image.bin", std::ios::binary);
if (!file.is_open()) {
std::cout << "\n\nNie udalo sie otworzyc pliku obrazu";
return -1;
}
char * memblock;
file.seekg(0, std::ios::end);
int sizeOfFile = file.tellg();
memblock = new char[sizeOfFile];
file.seekg(0, std::ios::beg);
file.read(memblock, sizeOfFile);
file.close();
size = sizeOfFile / sizeof(*out_image);
cudaMallocManaged((void **)&out_image, size * sizeof(*out_image));
memcpy(out_image, memblock, sizeOfFile);
free(memblock);
//gaussKernel is loaded same way and it works for sure
unsigned int dimX = UMIN(UMIN(PowTwoDivider(imgDims[0]), PowTwoDivider(imgDims[1])), 64);
unsigned int dimY = UMIN(UMIN(PowTwoDivider(imgDims[2]), PowTwoDivider(imgDims[1])), 512 / dimX);
dim3 dimBlock(dimX, dimY);
dim3 dimGridX(imgDims[1] / dimBlock.x, imgDims[2] / dimBlock.y);
gaussBlur<<< dimGridX, dimBlock >>>(out_image, gaussKernel, size, gaussKernelSize, imgDims, gaussKernelDims);
cudaPeekAtLastError();
cudaDeviceSynchronize();
//if i try to access out_image here program will crush
}
I tried to save blurring result in other variable (other pointer passed as argument same way as image) but have same issue so i doubt it will be memory race and since it's only 3x3x3 I don't think if taking value before or after another thread do so will be significant. When for test I copy that code to cpu version and it works perfectly fine:
__host__ void gaussBlurCPU(float *img, float *gaussKernel, int imgSize, int kernelSize, int *imgDims, int *gaussKernelDims) {
float newVoxelValue = 0;
int temp_idx = 0;
int kernel_x, kernel_y, kernel_z, temp_i, indeksWartosciDoPobrania = 0;
int img_z, img_y, img_x;
for (int idx = 0; idx < imgSize; idx++) {
if (idx < imgSize) {
img_z = idx / (imgDims[0] * imgDims[1]);
temp_idx = idx - (img_z * imgDims[0] * imgDims[1]);
img_y = temp_idx / imgDims[0];
img_x = temp_idx % imgDims[0];
if (img_x > 1 && img_x < imgDims[0] - 2 &&
img_y > 1 && img_y < imgDims[1] - 2 &&
img_z > 1 && img_z < imgDims[2] - 2) {
newVoxelValue = 0;
for (int i = 0; i < kernelSize; i++) {
kernel_z = i / (gaussKernelDims[0] * gaussKernelDims[1]);
temp_i = i - (kernel_z * gaussKernelDims[0] * gaussKernelDims[1]);
kernel_y = temp_i / gaussKernelDims[0];
kernel_x = temp_i % gaussKernelDims[0];
kernel_x--;kernel_y--;kernel_z--;
indeksWartosciDoPobrania = ((img_z + kernel_z) * imgDims[0] * imgDims[1]) + ((img_y + kernel_y) * imgDims[0]) + (img_x + kernel_x);
newVoxelValue += gaussKernel[i] * img[indeksWartosciDoPobrania];
}
img[idx] = newVoxelValue;
}
}
}
}
Image is binary saved file from matlab (read in c++ then modified and saved works perfectly fine back in matlab) values in range 0.0f:~900.0f. GaussKernel is binary saved file from matlab (read in c++ then modified and saved works perfectly fine back in matlab) summ of all values is equal to 1.
To be precise i copy/paste gaussBlur() here but main is copied from few places in my code so i propably missed something
thanks for all advices in advance, best regards, Wojciech Serafin
EDIT: As Robert Crovella said i comments WDDM TDR timeout was reason for my code act like this. Sorry for bothering You, I might make to little research in this area before I asked.