I'm writing a code synthesizer which converts high-level models into CUDA C code. As test model, I'm using a Mandelbrot generator application which executes the iteration count for each X-Y coordinate in parallel on a GPGPU. The image is 70x70 pixels, and the X-Y coordinates range from (-1, -1) to (1, 1). For simplicity, the application expects a large float
array, where each group of 3 elements contains the X and Y coordinates, followed by the maximum iteration count. Each thread on the GPGPU receives a pointer to the beginning of each 3-group set and calculates the iteration count.
The synthesized CUDA code works perfectly when maximum iteration counts is less than 5,500,000, but when it goes higher than that then the output becomes completely bogus. To illustrate, see the examples below:
Normal output when max_it
is set to 5,000,000:
output[0]: 3
output[1]: 3
output[2]: 3
output[3]: 3
output[4]: 3
output[5]: 3
output[6]: 3
output[7]: 3
output[8]: 3
output[9]: 4
output[10]: 4
output[11]: 4
output[12]: 4
output[13]: 4
output[14]: 4
output[15]: 5
output[16]: 5
output[17]: 5
output[18]: 5
output[19]: 5
output[20]: 6
output[21]: 7
output[22]: 9
output[23]: 11
output[24]: 19
output[25]: 5000000
output[26]: 5000000
output[27]: 5000000
...
output[4878]: 2
output[4879]: 2
output[4880]: 2
output[4881]: 2
output[4882]: 2
output[4883]: 2
output[4884]: 2
output[4885]: 2
output[4886]: 2
output[4887]: 2
output[4888]: 2
output[4889]: 2
output[4890]: 2
output[4891]: 2
output[4892]: 2
output[4893]: 2
output[4894]: 2
output[4895]: 2
output[4896]: 2
output[4897]: 2
output[4898]: 2
output[4899]: 2
Bogus output when max_it
is set to 6,000,000:
output[0]: 0
output[1]: 0
output[2]: 0
output[3]: 0
output[4]: 0
output[5]: 0
output[6]: 0
output[7]: 0
output[8]: 0
output[9]: 0
output[10]: 0
output[11]: 0
output[12]: 0
output[13]: 0
output[14]: 0
output[15]: 0
output[16]: 0
output[17]: 0
output[18]: 0
output[19]: 0
output[20]: 0
output[21]: 0
output[22]: 0
output[23]: 0
output[24]: 0
output[25]: 0
output[26]: 0
output[27]: 0
...
output[4877]: 0
output[4878]: -1161699328
output[4879]: 32649
output[4880]: -1698402160
output[4881]: 32767
output[4882]: -1177507963
output[4883]: 32649
output[4884]: 6431616
output[4885]: 0
output[4886]: -1174325376
output[4887]: 32649
output[4888]: -1698402384
output[4889]: 32767
output[4890]: 4199904
output[4891]: 0
output[4892]: -1698402160
output[4893]: 32767
output[4894]: -1177511704
output[4895]: 32649
output[4896]: -1174325376
output[4897]: 32649
output[4898]: -1177559142
output[4899]: 32649
And here follows the code:
mandelbrot.cpp (main file)
#include "mandelbrot.h"
#include <iostream>
#include <cstdlib>
using namespace std;
int main(int argc, char** argv) {
const int kNumPixelsRow = 70;
const int kNumPixelsCol = 70;
if (argc != 6) {
cout << "Must provide 5 arguments: " << endl
<< " #1: Lower left corner X coordinate (x0)" << endl
<< " #2: Lower left corner Y coordinate (y0)" << endl
<< " #3: Upper right corner X coordinate (x1)" << endl
<< " #4: Upper right corner Y coordinate (y1)" << endl
<< " #5: Maximum number of iterations" << endl;
return 0;
}
float x0 = (float) atof(argv[1]);
if (x0 < -2.5) {
cout << "x0 is too small, must be larger than -2.5" << endl;
return 0;
}
float y0 = (float) atof(argv[2]);
if (y0 < -1) {
cout << "y0 is too small, must be larger than -1" << endl;
return 0;
}
float x1 = (float) atof(argv[3]);
if (x1 > 1) {
cout << "x1 is too large, must be smaller than 1" << endl;
return 0;
}
float y1 = (float) atof(argv[4]);
if (y1 > 1) {
cout << "x0 is too large, must be smaller than 1" << endl;
return 0;
}
int max_it = atoi(argv[5]);
if (max_it <= 0) {
cout << "max_it is too small, must be larger than 0" << endl;
return 0;
}
cout << "Generating input data..." << endl;
float input_array[kNumPixelsRow][kNumPixelsCol][3];
float delta_x = (x1 - x0) / kNumPixelsRow;
float delta_y = (y1 - y0) / kNumPixelsCol;
for (int x = 0; x < kNumPixelsCol; ++x) {
for (int y = 0; y < kNumPixelsRow; ++y) {
if (x == 0) {
input_array[x][y][0] = x0;
}
else {
input_array[x][y][0] = input_array[x - 1][y][0] + delta_x;
}
if (y == 0) {
input_array[x][y][1] = y0;
}
else {
input_array[x][y][1] = input_array[x][y - 1][1] + delta_y;
}
input_array[x][y][2] = (float) max_it;
}
}
cout << "Executing..." << endl;
struct ModelOutput output = executeModel((float*) input_array);
cout << "Done." << endl;
for (int i = 0; i < kNumPixelsRow * kNumPixelsCol; ++i) {
cout << "output[" << i << "]: " << output.value1[i] << endl;
}
return 0;
}
mandelbrot.h (header file)
////////////////////////////////////////////////////////////
// AUTO-GENERATED BY f2cc 0.1
////////////////////////////////////////////////////////////
/**
* C struct for retrieving the output values from the model.
* This is needed since C functions can only return a single
* value.
*/
struct ModelOutput {
/**
* Output from process "parallelmapSY_1".
*/
int value1[4900];
};
/**
* Executes the model.
*
* @param input1
* Input to process "parallelmapSY_1".
* Expects an array of size 14700.
* @returns A struct containing the model outputs.
*/
struct ModelOutput executeModel(const float* input1);
mandelbrot.cu (CUDA file)
////////////////////////////////////////////////////////////
// AUTO-GENERATED BY f2cc 0.1
////////////////////////////////////////////////////////////
#include "mandelbrot.h"
__device__
int parallelmapSY_1_func1(const float* args) {
float x0 = args[0];
float y0 = args[1];
int max_it = (int) args[2];
float x = 0;
float y = 0;
int i = 0;
while (x*x + y*y < (2*2) && i < max_it) {
float x_temp = x*x - y*y + x0;
y = 2*x*y + y0;
x = x_temp;
++i;
}
return i;
}
__global__
void parallelmapSY_1__kernel(const float* input, int* output) {
unsigned int index = (blockIdx.x * blockDim.x + threadIdx.x);
if (index < 4900) {
output[index] = parallelmapSY_1_func1(&input[index * 3]);
}
}
void parallelmapSY_1__kernel_wrapper(const float* input, int* output) {
float* device_input;
int* device_output;
struct cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
int max_block_size = prop.maxThreadsPerBlock;
int num_blocks = (4900 + max_block_size - 1) / max_block_size;
cudaMalloc((void**) &device_input, 14700 * sizeof(float));
cudaMalloc((void**) &device_output, 4900 * sizeof(int));
cudaMemcpy((void*) device_input, (void*) input, 14700 * sizeof(float), cudaMemcpyHostToDevice);
dim3 grid(num_blocks, 1);
dim3 blocks(max_block_size, 1);
parallelmapSY_1__kernel<<<grid, blocks>>>(device_input, device_output);
cudaMemcpy((void*) output, (void*) device_output, 4900 * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree((void*) device_input);
cudaFree(((void*) device_output);
}
struct ModelOutput executeModel(const float* input1) {
// Declare signal variables
// Signals part of DelaySY processes are also initiated with delay value
float model_input_to_parallelmapSY_1_in[14700];
int parallelmapSY_1_out_to_model_output[4900];
// Copy model inputs to signal variables
for (int i = 0; i < 14700; ++i) {
model_input_to_parallelmapSY_1_in[i] = input1[i];
}
// Execute processes
parallelmapSY_1__kernel_wrapper(model_input_to_parallelmapSY_1_in, parallelmapSY_1_out_to_model_output);
// Copy model output values to return container
struct ModelOutput outputs;
for (int i = 0; i < 4900; ++i) {
outputs.value1[i] = parallelmapSY_1_out_to_model_output[i];
}
return outputs;
}
The interesting file is mandelbrot.cu
as that contains the computational code; mandelbrot.cpp
is just a driver to get user input and generate input data, and mandelbrot.h
is just a header file so that mandelbrot.cpp
can easily use mandelbrot.cu
.
The function executeModel()
is a wrapper function which takes care of propagating data between the processes in the model. In this case there is only one process so executeModel()
is rather pointless.
parallelmapSY_1__kernel_wrapper()
prepares the parallel execution by allocating memory on the device, transfers the input data, invokes the kernel, and transfers the result back to the host.
parallelmapSY_1__kernel()
is the kernel function, which simply calls parallelmapSY_1_func1()
with the appropriate input data. It also prevents execution when too many threads have been spawned.
So the real area of interest is parallelmapSY_1_func1()
. As I said, it works perfectly when the maximum iteration count is less than 5,500,000, but when I go higher it just doesn't seem to work as it's supposed to (see output log above). Some may ask "Why are you setting the iteration count so high? That's not necessary!". True, but since the pure C equivalent works perfectly with higher maximum iteration counts, why shouldn't the CUDA version? Since I'm designing a general tool, I need to know why it doesn't work in this example.
So does anyone have any idea what the code appears to fail when the maximum iteration count fails when exceeding 5,500,000?