For some reason when I execute my program the device variables have a zero values. Just before I execute the cuda kernel the device variables have the correct values. The output image is just black of the original image size. All the memory allocations and copying to and from host seem to be correct.
Thanks for any help!
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#endif
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h" // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
// This function effectively swirls an image
// This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
// Each thread on the GPU will process exactly one pixel
// Before doing anything, we need to determine the current pixel we are calculating in this thread
// Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
// Test to see if we're testing a valid pixel
if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location
double cX = (double)width/2.0f;
double cY = (double)height/2.0f;
double relY = cY-i;
double relX = j-cX;
// relX and relY are points in our UV space
// Calculate the angle our points are relative to UV origin. Everything is in radians.
double originalAngle;
if (relX != 0)
{
originalAngle = atan(abs(relY)/abs(relX));
if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
else if (relX <=0 && relY <0) originalAngle += C_PI;
}
else
{
// Take care of rare special case
if (relY >= 0) originalAngle = 0.5f * C_PI;
else originalAngle = 1.5f * C_PI;
}
// Calculate the distance from the center of the UV using pythagorean distance
double radius = sqrt(relX*relX + relY*relY);
// Use any equation we want to determine how much to rotate image by
//double newAngle = originalAngle + factor*radius; // a progressive twist
double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
// Transform source UV coordinates back into bitmap coordinates
int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
srcX += cX;
srcY += cY;
srcY = height - srcY;
// Clamp the source to legal image pixel
if (srcX < 0) srcX = 0;
else if (srcX >= width) srcX = width-1;
if (srcY < 0) srcY = 0;
else if (srcY >= height) srcY = height-1;
// Set the pixel color
// Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName);
// Process command-line arguments
if (argc > 1)
{
if (checkCmdLineFlag(argc, (const char **) argv, "input"))
{
getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
}
else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
{
printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE);
}
}
runTest(argc, argv);
cudaDeviceReset();
printf("%s completed",
sampleName);
//exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
int devID = findCudaDevice(argc, (const char **) argv);
unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
// load image from disk
float *hData = NULL;
float *host_array_kernel = 0;
float *device_array_Image = 0;
float *device_array_kernel = 0;
float *device_array_Result = 0;
unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL)
{
printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE);
}
sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Allocation of device arrays using CudaMalloc
cudaMalloc((void**)&device_array_Image, size);
cudaMalloc((void**)&device_array_kernel, kernel_bytes);
cudaMalloc((void**)&device_array_Result, size);
host_array_kernel = (float*)malloc(kernel_bytes); // kernel
// Allocate mem for the result on host side
float *hOutputDataSharp = (float *) malloc(size);
GenerateKernel (host_array_kernel);
// copy arrays and kernel from host to device
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
//Do the Convolution
printf("DImage : '%.8f'\n",device_array_Image);
printf("DKernel : '%.8f'\n",device_array_kernel);
//serialConvolution(hData, host_array_kernel ,hOutputDataSharp);
SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
printf("DResult : '%.8f'\n",device_array_Result);
checkCudaErrors(cudaDeviceSynchronize());
cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
printf("HResult : '%.8f'\n",hOutputDataSharp);
// Write result to file
char outputSharp[1024];
strcpy(outputSharp, imagePath);
strcpy(outputSharp, "data/serial_sharptest.pgm");
sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
cudaFree(device_array_Result);
cudaFree(device_array_Image);
cudaFree(device_array_kernel);
free(hData);
free(imagePath);
//free(host_array_Image);
free(host_array_kernel);
free(hOutputDataSharp);
//free(hOutputImage);
//free(hOutputKernel);
}