Why does this CUDA code for calculating a Mandelbrot set fail when setting the maximum iteration count higher than 5,500,000?

Question

I'm writing a code synthesizer which converts high-level models into CUDA C code. As test model, I'm using a Mandelbrot generator application which executes the iteration count for each X-Y coordinate in parallel on a GPGPU. The image is 70x70 pixels, and the X-Y coordinates range from (-1, -1) to (1, 1). For simplicity, the application expects a large float array, where each group of 3 elements contains the X and Y coordinates, followed by the maximum iteration count. Each thread on the GPGPU receives a pointer to the beginning of each 3-group set and calculates the iteration count.

The synthesized CUDA code works perfectly when maximum iteration counts is less than 5,500,000, but when it goes higher than that then the output becomes completely bogus. To illustrate, see the examples below:

Normal output when `max_it` is set to 5,000,000:

output[0]: 3
output[1]: 3
output[2]: 3
output[3]: 3
output[4]: 3
output[5]: 3
output[6]: 3
output[7]: 3
output[8]: 3
output[9]: 4
output[10]: 4
output[11]: 4
output[12]: 4
output[13]: 4
output[14]: 4
output[15]: 5
output[16]: 5
output[17]: 5
output[18]: 5
output[19]: 5
output[20]: 6
output[21]: 7
output[22]: 9
output[23]: 11
output[24]: 19
output[25]: 5000000
output[26]: 5000000
output[27]: 5000000
...
output[4878]: 2
output[4879]: 2
output[4880]: 2
output[4881]: 2
output[4882]: 2
output[4883]: 2
output[4884]: 2
output[4885]: 2
output[4886]: 2
output[4887]: 2
output[4888]: 2
output[4889]: 2
output[4890]: 2
output[4891]: 2
output[4892]: 2
output[4893]: 2
output[4894]: 2
output[4895]: 2
output[4896]: 2
output[4897]: 2
output[4898]: 2
output[4899]: 2

Bogus output when `max_it` is set to 6,000,000:

output[0]: 0
output[1]: 0
output[2]: 0
output[3]: 0
output[4]: 0
output[5]: 0
output[6]: 0
output[7]: 0
output[8]: 0
output[9]: 0
output[10]: 0
output[11]: 0
output[12]: 0
output[13]: 0
output[14]: 0
output[15]: 0
output[16]: 0
output[17]: 0
output[18]: 0
output[19]: 0
output[20]: 0
output[21]: 0
output[22]: 0
output[23]: 0
output[24]: 0
output[25]: 0
output[26]: 0
output[27]: 0
...
output[4877]: 0
output[4878]: -1161699328
output[4879]: 32649
output[4880]: -1698402160
output[4881]: 32767
output[4882]: -1177507963
output[4883]: 32649
output[4884]: 6431616
output[4885]: 0
output[4886]: -1174325376
output[4887]: 32649
output[4888]: -1698402384
output[4889]: 32767
output[4890]: 4199904
output[4891]: 0
output[4892]: -1698402160
output[4893]: 32767
output[4894]: -1177511704
output[4895]: 32649
output[4896]: -1174325376
output[4897]: 32649
output[4898]: -1177559142
output[4899]: 32649

And here follows the code:

mandelbrot.cpp (main file)

#include "mandelbrot.h"
#include <iostream>
#include <cstdlib>

using namespace std;

int main(int argc, char** argv) {
    const int kNumPixelsRow = 70;
    const int kNumPixelsCol = 70;

    if (argc != 6) {
        cout << "Must provide 5 arguments: " << endl
             << "   #1: Lower left corner X coordinate (x0)" << endl
             << "   #2: Lower left corner Y coordinate (y0)" << endl
             << "   #3: Upper right corner X coordinate (x1)" << endl
             << "   #4: Upper right corner Y coordinate (y1)" << endl
             << "   #5: Maximum number of iterations" << endl;
        return 0;
    }

    float x0 = (float) atof(argv[1]);
    if (x0 < -2.5) {
        cout << "x0 is too small, must be larger than -2.5" << endl;
        return 0;
    }
    float y0 = (float) atof(argv[2]);
    if (y0 < -1) {
        cout << "y0 is too small, must be larger than -1" << endl;
        return 0;
    }
    float x1 = (float) atof(argv[3]);
    if (x1 > 1) {
        cout << "x1 is too large, must be smaller than 1" << endl;
        return 0;
    }
    float y1 = (float) atof(argv[4]);
    if (y1 > 1) {
        cout << "x0 is too large, must be smaller than 1" << endl;
        return 0;
    }
    int max_it = atoi(argv[5]);
    if (max_it <= 0) {
        cout << "max_it is too small, must be larger than 0" << endl;
        return 0;
    }

    cout << "Generating input data..." << endl;
    float input_array[kNumPixelsRow][kNumPixelsCol][3];
    float delta_x = (x1 - x0) / kNumPixelsRow;
    float delta_y = (y1 - y0) / kNumPixelsCol;
    for (int x = 0; x < kNumPixelsCol; ++x) {
        for (int y = 0; y < kNumPixelsRow; ++y) {
            if (x == 0) {
                input_array[x][y][0] = x0;
            }
            else {
                input_array[x][y][0] = input_array[x - 1][y][0] + delta_x;
            }
            if (y == 0) {
                input_array[x][y][1] = y0;
            }
            else {
                input_array[x][y][1] = input_array[x][y - 1][1] + delta_y;
            }
            input_array[x][y][2] = (float) max_it;
        }
    }

    cout << "Executing..." << endl;
    struct ModelOutput output = executeModel((float*) input_array);
    cout << "Done." << endl;

    for (int i = 0; i < kNumPixelsRow * kNumPixelsCol; ++i) {
        cout << "output[" << i << "]: " << output.value1[i] << endl;
    }

    return 0;
}

mandelbrot.h (header file)

////////////////////////////////////////////////////////////
// AUTO-GENERATED BY f2cc 0.1
////////////////////////////////////////////////////////////

/**
 * C struct for retrieving the output values from the model. 
 * This is needed since C functions can only return a single 
 * value.
*/
struct ModelOutput {
    /**
     * Output from process "parallelmapSY_1".
     */
    int value1[4900];
};

/**
 * Executes the model.
 *
 * @param input1
 *        Input to process "parallelmapSY_1".
 *        Expects an array of size 14700.
 * @returns A struct containing the model outputs.
 */

struct ModelOutput executeModel(const float* input1);

mandelbrot.cu (CUDA file)

////////////////////////////////////////////////////////////
// AUTO-GENERATED BY f2cc 0.1
////////////////////////////////////////////////////////////

#include "mandelbrot.h"

__device__
int parallelmapSY_1_func1(const float* args) {
          float x0 = args[0];
          float y0 = args[1];
          int max_it = (int) args[2];
          float x = 0;
          float y = 0;
          int i = 0;
          while (x*x + y*y < (2*2) && i < max_it) {
            float x_temp = x*x - y*y + x0;
            y = 2*x*y + y0;
            x = x_temp;
            ++i;
          }
          return i;
        }


__global__
void parallelmapSY_1__kernel(const float* input, int* output) {
    unsigned int index = (blockIdx.x * blockDim.x + threadIdx.x);
    if (index < 4900) {
        output[index] = parallelmapSY_1_func1(&input[index * 3]);
    }
}

void parallelmapSY_1__kernel_wrapper(const float* input, int* output) {
    float* device_input;
    int* device_output;
    struct cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    int max_block_size = prop.maxThreadsPerBlock;
    int num_blocks = (4900 + max_block_size - 1) / max_block_size;
    cudaMalloc((void**) &device_input, 14700 * sizeof(float));
    cudaMalloc((void**) &device_output, 4900 * sizeof(int));
    cudaMemcpy((void*) device_input, (void*) input, 14700 * sizeof(float), cudaMemcpyHostToDevice);
    dim3 grid(num_blocks, 1);
    dim3 blocks(max_block_size, 1);
    parallelmapSY_1__kernel<<<grid, blocks>>>(device_input, device_output);
    cudaMemcpy((void*) output, (void*) device_output, 4900 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree((void*) device_input);
    cudaFree(((void*) device_output);
}


struct ModelOutput executeModel(const float* input1) {
    // Declare signal variables
    // Signals part of DelaySY processes are also initiated with delay value
    float model_input_to_parallelmapSY_1_in[14700];
    int parallelmapSY_1_out_to_model_output[4900];

    // Copy model inputs to signal variables
    for (int i = 0; i < 14700; ++i) {
        model_input_to_parallelmapSY_1_in[i] = input1[i];
    }

    // Execute processes
    parallelmapSY_1__kernel_wrapper(model_input_to_parallelmapSY_1_in, parallelmapSY_1_out_to_model_output);

    // Copy model output values to return container
    struct ModelOutput outputs;
    for (int i = 0; i < 4900; ++i) {
        outputs.value1[i] = parallelmapSY_1_out_to_model_output[i];
    }
    return outputs;
}

The interesting file is mandelbrot.cu as that contains the computational code; mandelbrot.cpp is just a driver to get user input and generate input data, and mandelbrot.h is just a header file so that mandelbrot.cpp can easily use mandelbrot.cu.

The function executeModel() is a wrapper function which takes care of propagating data between the processes in the model. In this case there is only one process so executeModel() is rather pointless.

parallelmapSY_1__kernel_wrapper() prepares the parallel execution by allocating memory on the device, transfers the input data, invokes the kernel, and transfers the result back to the host.

parallelmapSY_1__kernel() is the kernel function, which simply calls parallelmapSY_1_func1() with the appropriate input data. It also prevents execution when too many threads have been spawned.

So the real area of interest is parallelmapSY_1_func1(). As I said, it works perfectly when the maximum iteration count is less than 5,500,000, but when I go higher it just doesn't seem to work as it's supposed to (see output log above). Some may ask "Why are you setting the iteration count so high? That's not necessary!". True, but since the pure C equivalent works perfectly with higher maximum iteration counts, why shouldn't the CUDA version? Since I'm designing a general tool, I need to know why it doesn't work in this example.

So does anyone have any idea what the code appears to fail when the maximum iteration count fails when exceeding 5,500,000?

What OS are you using, and do you have a display attached to the GPU card that you are using for CUDA ? — Paul R, Nov 30 '11 at 12:58
Can you add what card, operating system and CUDA version you are using? — talonmies, Nov 30 '11 at 12:59
The machine runs Ubuntu 10.04, and we are a bit unsure whether the card is a GeForce GT 240 or a Quadro NVS 290. — gablin, Nov 30 '11 at 13:35

score 6 · Accepted Answer · edited May 23 '17 at 10:29

6

It may be a time-out problem with your video card and the OS causing the CUDA task to be aborted. See e.g. CUDA apps time out & fail after several seconds - how to work around this?

edited May 23 '17 at 10:29

Community

1
1

answered Nov 30 '11 at 12:57

Paul R

208,748
37
389
560

That would make sense as the code _sometimes_ work with 5,500,000 and sometimes not. This sounds promising. – gablin Nov 30 '11 at 13:36
1

Yes, this was indeed the case (just forgot to accept the answer). – gablin Jan 01 '12 at 19:42

Why does this CUDA code for calculating a Mandelbrot set fail when setting the maximum iteration count higher than 5,500,000?

Normal output when max_it is set to 5,000,000:

Bogus output when max_it is set to 6,000,000:

mandelbrot.cpp (main file)

mandelbrot.h (header file)

mandelbrot.cu (CUDA file)

1 Answers1

Normal output when `max_it` is set to 5,000,000:

Bogus output when `max_it` is set to 6,000,000: