0

I've written the following code for generating a Julia set fractal in CUDA C/C++ with some help from online sources. I've been trying for hours now, but I'm unable to figure out as to why this always generates a grey image rather than the one I get when I run the CPU code. I'm new to CUDA C and parallel programming, and I'm currently referring to CUDA by Example by Sanders and Kandrot.

Here is the CPU variant of code, which runs fine with all the necessary imports in VS2013:

/*
 References:
 [1] http://stackoverflow.com/questions/23711681/generating-custom-color-palette-for-julia-set
 [2] http://www.cs.rit.edu/~ncs/color/t_convert.html
*/

#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <string.h>
#include <IL/il.h>
#include <IL/ilu.h>
#include <time.h>

using namespace std;

#define N 1024
#define SQRT_2 1.4142
#define MAX_ITER 512

void HSVtoRGB( float *r, float *g, float *b, float h, float s, float v );
void saveImage(int width, int height, unsigned char * bitmap, complex<float> seed);
void compute_julia(complex<float> c, unsigned char * image);


int main(int argc, char **argv)
{
    complex<float> c(0.285f, 0.01f);
    if(argc > 2)
    {
        c.real(atof(argv[1]));
        c.imag(atof(argv[2]));
    } else
        fprintf(stderr, "Usage: %s <real> <imag>\nWhere <real> and <imag> form the complex seed for the Julia set.\n", argv[0]);

    ilInit();
    unsigned char *image = new unsigned char[N*N*3]; //RGB image
    compute_julia(c, image);
    saveImage(N, N, image, c);
    delete[] image;
}

void compute_julia(complex<float> c, unsigned char * image)
{
    complex<float> z_old(0.0f, 0.0f);
    complex<float> z_new(0.0f, 0.0f);
    for(int y=0; y<N; y++)
        for(int x=0; x<N; x++)
        {
            z_new.real(4.0f * x / (N) - 2.0f);
            z_new.imag(4.0f * y / (N) - 2.0f);
            int i;
            for(i=0; i<MAX_ITER; i++)
            {
                z_old.real(z_new.real());
                z_old.imag(z_new.imag());
                z_new = pow(z_new, 2);
                z_new += c;
                if(norm(z_new) > 4.0f) break;
            }
            float brightness = (i<MAX_ITER) ? 1.0f : 0.0f;
            float hue = (i % MAX_ITER)/float(MAX_ITER - 1);
            hue = (120*sqrtf(hue) + 150);
            float r, g, b;
            HSVtoRGB(&r, &g, &b, hue, 1.0f, brightness);
            image[(x + y*N)*3 + 0] = (unsigned char)(b*255);
            image[(x + y*N)*3 + 1] = (unsigned char)(g*255);
            image[(x + y*N)*3 + 2] = (unsigned char)(r*255);
        }
}

void saveImage(int width, int height, unsigned char * bitmap, complex<float> seed)
{
    ILuint imageID = ilGenImage();
    ilBindImage(imageID);
    ilTexImage(width, height, 1, 3, IL_RGB, IL_UNSIGNED_BYTE, bitmap);
    //ilEnable(IL_FILE_OVERWRITE);
    char imageName[256];
    sprintf(imageName, "Julia %.3f + i%.3f.png", seed.real(), seed.imag());
    ilSave(IL_PNG, imageName);
    fprintf(stderr, "Image saved as: %s\n", imageName);
}

// r,g,b values are from 0 to 1
// h = [0,360], s = [0,1], v = [0,1]
//      if s == 0, then h = -1 (undefined)
void HSVtoRGB( float *r, float *g, float *b, float h, float s, float v )
{
    int i;
    float f, p, q, t;
    if( s == 0 ) {
        // achromatic (grey)
        *r = *g = *b = v;
        return;
    }
    h /= 60;            // sector 0 to 5
    i = floor( h );
    f = h - i;          // factorial part of h
    p = v * ( 1 - s );
    q = v * ( 1 - s * f );
    t = v * ( 1 - s * ( 1 - f ) );
    switch( i ) {
        case 0:
            *r = v;
            *g = t;
            *b = p;
            break;
        case 1:
            *r = q;
            *g = v;
            *b = p;
            break;
        case 2:
            *r = p;
            *g = v;
            *b = t;
            break;
        case 3:
            *r = p;
            *g = q;
            *b = v;
            break;
        case 4:
            *r = t;
            *g = p;
            *b = v;
            break;
        default:        // case 5:
            *r = v;
            *g = p;
            *b = q;
            break;
    }
}

And here is the corresponding GPU version (note that it is pretty unrefined at the moment, I'll do that once I'm able to get basic functionality out of it):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <string.h>
#include <IL/il.h>
#include <IL/ilu.h>
#include <time.h>
/*
References:
[1] http://stackoverflow.com/questions/23711681/generating-custom-color-palette-for-julia-set
[2] http://www.cs.rit.edu/~ncs/color/t_convert.html
*/

using namespace std;

#define N 1024
#define SQRT_2 1.4142
#define MAX_ITER 512

struct cuComplex {
    float r;
    float i;
    __host__ __device__ cuComplex(float a, float b) : r(a), i(b) {}
    __host__ __device__ float magnitude2(void) {
        return r * r + i * i;
    }
    __host__ __device__ cuComplex operator*(const cuComplex& a) {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    __host__ __device__ cuComplex operator+(const cuComplex& a) {
        return cuComplex(r + a.r, i + a.i);
    }
};

void HSVtoRGB(float *r, float *g, float *b, float h, float s, float v);
void saveImage(int width, int height, unsigned char * bitmap, cuComplex seed);
void compute_julia(complex<float> c, unsigned char * image);
__global__ void compute_julia_gpu(unsigned char* image);
__device__ void HSVtoRGB_GPU(float *r, float *g, float *b, float h, float s, float v);

int main(int argc, char **argv)
{
    cuComplex c(-0.8f, 0.156f);
    /*
    if (argc > 2)
    {
        c.real(atof(argv[1]));
        c.imag(atof(argv[2]));
    }*/
    fprintf(stderr, "Usage: %s <real> <imag>\nWhere <real> and <imag> form the complex seed for the Julia set.\n", argv[0]);

    ilInit();

    dim3 grid(N, N);
    unsigned char *image = new unsigned char[N*N * 3]; //RGB image

    size_t size = sizeof(image);

    unsigned char *d_image; //RGB image

    cudaMalloc((void **)&d_image, size);

    compute_julia_gpu<<<grid, 1>>>(d_image);

    cudaMemcpy(image, d_image, size, cudaMemcpyDeviceToHost);

    saveImage(N, N, image, c);

    cudaFree(d_image);
    delete[] image;
}

__global__ void compute_julia_gpu(unsigned char* image) {
    /*
    complex<float> z_old(0.0f, 0.0f);
    complex<float> z_new(0.0f, 0.0f);
    complex<float> c(-0.8f, 0.156f);
    */
    cuComplex z_old(0.0, 0.0);
    cuComplex z_new(0.0, 0.0);
    cuComplex c(-0.8f, 0.156f);
    int x = blockIdx.x;
    int y = blockIdx.y;
    z_new.r = (4.0f * x / (N)-2.0f);
    z_new.i = (4.0f * y / (N)-2.0f);
    int i = 0;
    for (i = 0; i<MAX_ITER; i++)
    {
        z_old.r = z_new.r;
        z_old.i = z_new.i;
        z_new = (z_new * z_new) + c;
        if (z_new.magnitude2() > 4.0f) break;
    }
    float brightness = (i<MAX_ITER) ? 1.0f : 0.0f;
    float hue = (i % MAX_ITER) / float(MAX_ITER - 1);
    hue = (120 * sqrtf(hue) + 150);
    float r, g, b;
    HSVtoRGB_GPU(&r, &g, &b, hue, 1.0f, brightness);
    image[(x + y*N) * 3 + 0] = (unsigned char)(b * 255);
    image[(x + y*N) * 3 + 1] = (unsigned char)(g * 255);
    image[(x + y*N) * 3 + 2] = (unsigned char)(r * 255);
}

void saveImage(int width, int height, unsigned char * bitmap, cuComplex seed)
{
    ILuint imageID = ilGenImage();
    ilBindImage(imageID);
    ilTexImage(width, height, 1, 3, IL_RGB, IL_UNSIGNED_BYTE, bitmap);
    //ilEnable(IL_FILE_OVERWRITE);
    char imageName[256];
    sprintf(imageName, "Julia %.3f + i%.3f.png", seed.r, seed.i);
    ilSave(IL_PNG, imageName);
    fprintf(stderr, "Image saved as: %s\n", imageName);
}

__device__ void HSVtoRGB_GPU(float *r, float *g, float *b, float h, float s, float v)
{
    int i;
    float f, p, q, t;
    if (s == 0) {
        // achromatic (grey)
        *r = *g = *b = v;
        return;
    }
    h /= 60;            // sector 0 to 5
    i = floor(h);
    f = h - i;          // factorial part of h
    p = v * (1 - s);
    q = v * (1 - s * f);
    t = v * (1 - s * (1 - f));
    switch (i) {
    case 0:
        *r = v;
        *g = t;
        *b = p;
        break;
    case 1:
        *r = q;
        *g = v;
        *b = p;
        break;
    case 2:
        *r = p;
        *g = v;
        *b = t;
        break;
    case 3:
        *r = p;
        *g = q;
        *b = v;
        break;
    case 4:
        *r = t;
        *g = p;
        *b = v;
        break;
    default:        // case 5:
        *r = v;
        *g = p;
        *b = q;
        break;
    }
}

Any help is appreciated, thanks.

Mat
  • 202,337
  • 40
  • 393
  • 406
Tarun Verma
  • 329
  • 5
  • 17

1 Answers1

2

The problem is your size variable:

#include <iostream>
#include <string.h>
using namespace std;

#define N 1024

int main() {
    unsigned char *image = new unsigned char[N*N * 3]; //RGB image
    size_t size = sizeof(image);
    cout << size;
    return 0;
}

in this case the ouput is 4 (on a 32 bit architecture) because sizeof returns the size of the variable's type. In this case it is unsigned char * and is 4 byte long.

You can run cuda-memcheck ./yourExecuteable and you will see errors when your code performs out of bounds access to the GPU's global memory. You will see a lot of errors because you are allocating only 4 bytes of global memory for your d_image array.

Michael Haidl
  • 5,384
  • 25
  • 43
  • I see the mistake with my approach, however, is there any possible way to allot the memory for my `d_image` variable? I'm new to this, so I don't know how to allocate memory for an `unsigned char` image holder variable. I know I'm asking a lot, but please help me out on this if you can. – Tarun Verma Jan 18 '15 at 14:26
  • `cudaMalloc((void **)&d_image, sizeof(unsigned char) * N * N * 3);` – Michael Haidl Jan 18 '15 at 14:27
  • I've tried this already since it seemed like the obvious choice, but this crashes my GPU driver and gives me back the grey image. That's why I thought that I was doing something wrong again with memory allocation. – Tarun Verma Jan 18 '15 at 14:33
  • @TarunVerma this "crash" happens because the Windows WDDM Watchdog detects an unresponsive driver (your kernel needs to much time) and Windows restarts the driver. Your configuration with 1024x1024 blocks and 1 Thread in each block is a performance killer. http://stackoverflow.com/questions/27903225/opencl-read-variable-size-result-buffer-from-the-gpu/27903586#27903586 in this question the same problem was occures. Look at my answer and follow the instructions to disable the WDDM watchdog or get a better launch configuration may help you. – Michael Haidl Jan 19 '15 at 06:24