CUDA Mandelbrot set

Question

This is a sequential Mandelbrot Set implementation.

 void mandelbrot(PGMData *I)
{
    float x0,y0,x,y,xtemp;
    int i,j;
    int color;
    int iter;
    int MAX_ITER=1000;  
    for(i=0; i<I->height; i++)
        for(j=0; j<I->width; j++)
        {
            x0 = (float)j/I->width*(float)3.5-(float)2.5; 
            y0 = (float)i/I->height*(float)2.0-(float)1.0;
            x = 0;
            y = 0;
            iter = 0;
            while((x*x-y*y <= 4) && (iter < MAX_ITER))
            { 
                xtemp = x*x-y*y+x0;
                y = 2*x*y+y0;
                x = xtemp;
                iter++;
            }
            color = (int)(iter/(float)MAX_ITER*(float)I->max_gray);
            I->image[i*I->width+j] = I->max_gray-color;
        }
}

I want to paralellize it using CUDA but I seem to have misunderstood something and now I'm stuck. I've tried searching the internet but nothing really great came up.

Kernel:

__global__ void calc(int *pos)
{
    int row= blockIdx.y * blockDim.y + threadIdx.y;  // WIDTH
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // HEIGHT
    int idx = row * WIDTH + col;

    if(col > WIDTH || row > HEIGHT || idx > N) return;

    float x0 = (float)row/WIDTH*(float)3.5-(float)2.5;
    float y0 = (float)col/HEIGHT*(float)2.0-(float)1.0; 

    int x = 0, y = 0, iter = 0, xtemp = 0;
    while((x*x-y*y <= 4) && (iter < MAX_ITER))
    { 
        xtemp = x*x-y*y+x0;
        y = 2*x*y+y0;
        x = xtemp;
        iter++;
    }
    int color = 255 - (int)(iter/(float)MAX_ITER*(float)255);
    __syncthreads();
    pos[idx] = color;//color;// - color;

}

The kernel is initiated this way:

dim3 block_size(16, 16);
dim3 grid_size((N)/block_size.x, (int) N / block_size.y);
calc<<<grid_size,block_size>>>(d_pgmData);

And here are the constants:

#define HEIGHT 512
#define WIDTH 512   
#define N (HEIGHT*WIDTH)

The whole GPU function

void mandelbrotGPU(PGMData *I)
{
    int *pos = (int *)malloc(HEIGHT*WIDTH*sizeof(int));
    int *d_pgmData;

    cudaMalloc((void **)&d_pgmData, sizeof(int)*WIDTH*HEIGHT);


    cudaMemcpy(d_pgmData, pos ,HEIGHT*WIDTH*sizeof(int) ,cudaMemcpyHostToDevice);

    dim3 block_size(16, 16);
    dim3 grid_size((N)/block_size.x, (int) N / block_size.y);
    calc<<<grid_size,block_size>>>(d_pgmData);

    cudaMemcpy(pos,d_pgmData,HEIGHT*WIDTH*sizeof(int) ,cudaMemcpyDeviceToHost);
    cudaFree(d_pgmData);
    I->image = pos;
}

The problem is: It's either returning garbage or the driver crashes. I would really appreciate some advice because I am really stuck.

Roger Dahl · Accepted Answer · 2013-11-26T05:48:47.830

Here's a working version of your code (using OpenCV):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv;
using namespace std;

#define HEIGHT 512 // must be multiple of block_size.y
#define WIDTH 512 // must be multiple of block_size.x
#define MAX_ITER 10000

void mandelbrotGPU(char*);
__global__ void calc(char* image_buffer);

#define cudaAssertSuccess(ans) { _cudaAssertSuccess((ans), __FILE__, __LINE__); }
inline void _cudaAssertSuccess(cudaError_t code, char *file, int line)
{
  if (code != cudaSuccess)  {
    fprintf(stderr,"_cudaAssertSuccess: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}

int main(int argc, char** argv)
{
  IplImage* image_output = cvCreateImage(cvSize(WIDTH, HEIGHT), IPL_DEPTH_8U, 1);
  mandelbrotGPU(image_output->imageData);
  cvShowImage("GPU", image_output);
  waitKey(0);
  cvReleaseImage(&image_output);
}

void mandelbrotGPU(char* image_buffer)
{
  char* d_image_buffer;
  cudaAssertSuccess(cudaMalloc(&d_image_buffer, WIDTH * HEIGHT));
  dim3 block_size(16, 16);
  dim3 grid_size(WIDTH / block_size.x, HEIGHT / block_size.y);
  calc<<<grid_size, block_size>>>(d_image_buffer);
  cudaAssertSuccess(cudaPeekAtLastError());
  cudaAssertSuccess(cudaDeviceSynchronize());
  cudaAssertSuccess(cudaMemcpy(image_buffer, d_image_buffer, HEIGHT * WIDTH, cudaMemcpyDeviceToHost));
  cudaAssertSuccess(cudaFree(d_image_buffer));
}

__global__ void calc(char* image_buffer)
{
  int row = blockIdx.y * blockDim.y + threadIdx.y;  // WIDTH
  int col = blockIdx.x * blockDim.x + threadIdx.x;  // HEIGHT
  int idx = row * WIDTH + col;
  if(col >= WIDTH || row >= HEIGHT) return;

  float x0 = ((float)col / WIDTH) * 3.5f - 2.5f;
  float y0 = ((float)row / HEIGHT) * 3.5f - 1.75f;

  float x = 0.0f;
  float y = 0.0f;
  int iter = 0;
  float xtemp;
  while((x * x + y * y <= 4.0f) && (iter < MAX_ITER))
  { 
    xtemp = x * x - y * y + x0;
    y = 2.0f * x * y + y0;
    x = xtemp;
    iter++;
  }

  int color = iter * 5;
  if (color >= 256) color = 0;
  image_buffer[idx] = color;
}

Output:

Mandelbrot output

The most important changes:

Removed __syncthreads();. This algorithm doesn't use the data generated by other threads, so there's no need to synchronize the threads.
Removed the copying of the host buffer over to the device. It's not necessary since the Mandelbrot algorithm writes the entire device buffer.
Fixed incorrect grid size calculation.
Removed the malloc of host memory because the result is copied directly into the OpenCV image buffer.
Changed the buffers to using bytes instead of ints, which is more convenient when you have a single gray channel with 8-bit resolution.
Removed some uneccessary float casts. When you use integers in calculations together with floats, the integers get automatically promoted to floats.
Fixed two issues in the Mandelbrot algorithm:
- x and y were declared as ints while they should be floats.
- The first expresssion in the while loop should contain a +, not a -.

Hi, could you please post the commands/tools that you used to build the project? I would really appreciate it. Thanks! — user3009269, Feb 15 '18 at 12:38
whats this?: ``#include "device_launch_parameters.h" what does it contain? — BitTickler, Feb 14 '20 at 10:07
@BitTickler Looks like it holds the declarations for blockIdx, blockDim, threadIdx, and gridDim; — Roger Dahl, Feb 15 '20 at 06:11

score 3 · Answer 2 · edited May 23 '17 at 12:22

This is certainly not correct:

    dim3 grid_size((N)/block_size.x, (int) N / block_size.y);

This is causing out-of-bounds accesses in your kernel. You want to launch WIDTH x HEIGHT threads in total, one for each pixel in your image. Instead you are launching N/16 x N/16 threads.

And it seems like you have a thread check line in your kernel (which should have prevented out-of-bounds accesses from errant threads), but it's not formulated correctly:

if(col > WIDTH || row > HEIGHT || idx > N) return;

For example this allows idx = N to pass the thread check, but this is not a valid memory location when written by the final line of the kernel:

pos[idx] = color;

You could fix this thread check with:

if(col >= WIDTH || row >= HEIGHT || idx >= N) return;

A few other comments:

you're not doing any proper cuda error checking
you can see the error in your code if you run it with cuda-memcheck

Apprentice Queue · Answer 3 · 2013-11-26T01:00:49.110

0

Just some ideas to look at:

There is no need to __syncthreads(). Your threads in a block don't communicate with each other.
There is no need to create device memory for I_WIDTH and I_HEIGHT. You just pass them in as values (instead of as pointers or reference). You do need need device memory for pos.
You need to check for the return value of all your CUDA functions (eg, cudaMalloc) and make sure it is all ok.
When you launch a kernel, your program can return before your GPU finishes. There are situations where you will need to wait explicitly for completion; you can do this by calling cudaDeviceSynchronize() after launching. In your case, you do not have to because your CUDA memcpy will wait until the kernel is completed.

edited Nov 26 '13 at 01:00

answered Nov 26 '13 at 00:05

Apprentice Queue

2,036
13
13

Your item 4 isn't correct. CUDA operations issued to the same stream (they are all in the same default stream here) are automatically serialized. The `cudaMemcpy` operation will not begin until the `calc` kernel preceding it has finished. There is no need for a call to `cudaDeviceSynchronize()` after launching if the concern is waiting for the kernel to finish. – Robert Crovella Nov 26 '13 at 00:26

CUDA Mandelbrot set

3 Answers3