Cuda - Device values 0 after kernel execution

Question

For some reason when I execute my program the device variables have a zero values. Just before I execute the cuda kernel the device variables have the correct values. The output image is just black of the original image size. All the memory allocations and copying to and from host seem to be correct.

Thanks for any help!

    // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#ifdef _WIN32
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#endif

#define Image_Size 512
#define Kernel_Size 3

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include "./inc/helper_functions.h"    // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include "./inc/helper_cuda.h"         // helper functions for CUDA error check

const char *imageFilename = "lena_bw.pgm";

const char *sampleName = "simpleTexture";

#define C_PI 3.141592653589793238462643383279502884197169399375

void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
    // This function effectively swirls an image
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
    // Each thread on the GPU will process exactly one pixel
    // Before doing anything, we need to determine the current pixel we are calculating in this thread
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    // Test to see if we're testing a valid pixel
    if (i >= height || j >= width) return;  // Don't bother doing the calculation. We're not in a valid pixel location

    double cX = (double)width/2.0f;
    double cY = (double)height/2.0f;
    double relY = cY-i;
    double relX = j-cX;
    // relX and relY are points in our UV space
    // Calculate the angle our points are relative to UV origin. Everything is in radians.
    double originalAngle;
    if (relX != 0)
    {
        originalAngle = atan(abs(relY)/abs(relX));
        if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
        else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
        else if (relX <=0 && relY <0) originalAngle += C_PI;
    }
    else
    {
        // Take care of rare special case
        if (relY >= 0) originalAngle = 0.5f * C_PI;
        else originalAngle = 1.5f * C_PI;
    }
    // Calculate the distance from the center of the UV using pythagorean distance
    double radius = sqrt(relX*relX + relY*relY);
    // Use any equation we want to determine how much to rotate image by
    //double newAngle = originalAngle + factor*radius;  // a progressive twist
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
    // Transform source UV coordinates back into bitmap coordinates
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
    srcX += cX;
    srcY += cY;
    srcY = height - srcY;
    // Clamp the source to legal image pixel
    if (srcX < 0) srcX = 0;
    else if (srcX >= width) srcX = width-1;
    if (srcY < 0) srcY = 0;
    else if (srcY >= height) srcY = height-1;
    // Set the pixel color
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}




////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "input"))
        {
            getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
        }
        else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
        {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }

    runTest(argc, argv);

    cudaDeviceReset();
    printf("%s completed",
           sampleName);
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **) argv);
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
    // load image from disk
    float *hData = NULL;
    float *host_array_kernel = 0;

    float *device_array_Image = 0;
    float *device_array_kernel = 0;
    float *device_array_Result = 0;


    unsigned int width, height;
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]);

    if (imagePath == NULL)
    {
        printf("Unable to source image file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(imagePath, &hData, &width, &height);

    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

    // Allocation of device arrays using CudaMalloc
    cudaMalloc((void**)&device_array_Image, size);
    cudaMalloc((void**)&device_array_kernel, kernel_bytes);
    cudaMalloc((void**)&device_array_Result, size);


    host_array_kernel = (float*)malloc(kernel_bytes); // kernel


   // Allocate mem for the result on host side
   float *hOutputDataSharp = (float *) malloc(size);

    GenerateKernel (host_array_kernel);


// copy arrays and kernel from host to device
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));



    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    //Do the Convolution
    printf("DImage : '%.8f'\n",device_array_Image);
    printf("DKernel : '%.8f'\n",device_array_kernel);
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp);


    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
    printf("DResult : '%.8f'\n",device_array_Result);
    checkCudaErrors(cudaDeviceSynchronize());
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
    printf("HResult : '%.8f'\n",hOutputDataSharp);
    // Write result to file
    char outputSharp[1024];

    strcpy(outputSharp, imagePath);
    strcpy(outputSharp, "data/serial_sharptest.pgm");
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height);

    cudaFree(device_array_Result);
    cudaFree(device_array_Image);
    cudaFree(device_array_kernel);
    free(hData);
    free(imagePath);
    //free(host_array_Image);
    free(host_array_kernel);
    free(hOutputDataSharp);
    //free(hOutputImage);
    //free(hOutputKernel);
}

Its not producing any errors. The image produced is just black instead of the processed image it's supposed to be. I am really new to cuda, essentially started a few days ago so im still coming to grips with understanding it. — Alpha, Apr 28 '16 at 12:45
@Alpha: How do you know it isn't producing errors if there is no error checking in the code? — talonmies, Apr 28 '16 at 12:58
@talonmies: Like I said I'm new to cuda so I've just been printing out the output values at different steps within the code to see why its producing a black image and the only thing I notice is that when copying back from device to host all values from the device are 0. — Alpha, Apr 28 '16 at 13:07
See this answer on error checking : http://stackoverflow.com/a/14038590/6172231 You need to check for errors at each call related to the API and your kernels, as you could retrieve an error from a previous function if you don't check for all of them. — Taro, Apr 28 '16 at 13:09
@Taro: Thanks! That was really helpful. After adding in the error checking I receive the error "an illegal memory access was encountered" just after the execution of the kernel, at the cudaDeviceSynchronize(). — Alpha, Apr 28 '16 at 14:38
This means, if you did error checking for previous CUDA-related calls, that the failure really occurs during the kernel execution. You might access memory at a non-correct address (for example, you could be out of your allocated array's bounds). I suggest you carefully check where and how you access data. — Taro, Apr 28 '16 at 14:41
The method described [here](http://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218) can help you narrow the out-of-bounds access down to a particular line of code in your kernel. — Robert Crovella, Apr 29 '16 at 01:43

score 1 · Answer 1 · answered Apr 28 '16 at 20:48

Your code is writing in the source image:

pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];

which writes to device_array_Image which is the source, not the destination you are expecting results in.

Moreover, I am very curious on the output of printf("DResult : '%.8f'\n",device_array_Result); as device_array_Result is in GPU address space and allocated with cudaMalloc. On which device are you running ?

Cuda - Device values 0 after kernel execution

1 Answers1