I'm trying to optimize the following kernel using OpenCL. Basically I'm doing triangle rasterization into a frame buffer of 800*800 pixels: I compute the bounding box of each triangle, and if the current scanline pixel is inside the triangle's bounding box then I rasterize it.
Here is the kernel and how I invoke it:
// Launch one work-item per framebuffer pixel: a 2-D NDRange of 800x800.
// NOTE(review): the kernel indexes `triangles` by get_global_id(0), i.e. by
// pixel column, not by triangle -- confirm this grid shape matches what the
// kernel actually needs (one work-item per triangle would be a 1-D range of
// triCount).
global_size[0] = 800;
global_size[1] = 800;
auto time_start = std::chrono::high_resolution_clock::now();
// Local work size is NULL, so the OpenCL runtime chooses the work-group shape.
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
// Block until the kernel completes so the elapsed time measured from
// `time_start` covers the actual device work, not just the enqueue.
CL_CHECK(clFinish(commandQueue));
And the kernel body:
// Kernel body: one work-item per (px, py) grid cell.
// NOTE(review): px is a pixel column (0..799) but is used to index
// `triangles[]` -- confirm triangles[] has at least 800 entries, otherwise
// this reads out of bounds. Structurally, one work-item per TRIANGLE
// (a 1-D NDRange of triCount) would match this code's intent better than
// one work-item per pixel, and is likely the real performance fix.
size_t px = get_global_id(0);
size_t py = get_global_id(1); // currently unused by the code below

const int width = 800;
const int height = 800;

// Load the triangle's three raster-space vertices.
float3 v0Raster = (float3)(triangles[px].v[0].pos[0], triangles[px].v[0].pos[1], triangles[px].v[0].pos[2]);
float3 v1Raster = (float3)(triangles[px].v[1].pos[0], triangles[px].v[1].pos[1], triangles[px].v[1].pos[2]);
float3 v2Raster = (float3)(triangles[px].v[2].pos[0], triangles[px].v[2].pos[1], triangles[px].v[2].pos[2]);

// Screen-space bounding box of the triangle.
float xmin = min3(v0Raster.x, v1Raster.x, v2Raster.x);
float ymin = min3(v0Raster.y, v1Raster.y, v2Raster.y);
float xmax = max3(v0Raster.x, v1Raster.x, v2Raster.x);
float ymax = max3(v0Raster.y, v1Raster.y, v2Raster.y);

// Hoist the rounded loop bounds out of the loop (the original re-rounded
// xmin/xmax on every iteration).
const int x0 = (int)round(xmin);
const int x1 = (int)round(xmax);

// BUG FIX: guard the division so a bounding box that collapses to a single
// column (xmax == xmin) does not produce an infinite slope; slope 0 plots the
// single pixel at (x0, ymin) instead.
const float dx = xmax - xmin;
const float slope = (dx != 0.0f) ? (ymax - ymin) / dx : 0.0f;

for (int x = x0; x <= x1; x++) {
    // BUG FIX: the original computed slope * (x - x0 + ymin); ymin must be
    // added AFTER the multiplication to follow y = m*(x - x0) + ymin.
    // (The unused dp/ratio/discard_ locals were removed; ratio also had a
    // precedence bug but was never read.)
    int y = (int)(slope * (x - x0) + ymin);

    // BUG FIX: reject out-of-range pixels with >= width / >= height; the
    // original used > width / > height, letting x == width and y == height
    // through to an out-of-bounds write.
    if (x < 0 || x >= width || y < 0 || y >= height) {
        continue;
    }

    // BUG FIX: the original index (width - x) + (height - y) * width could
    // reach width * (height + 1), past the end of an 800*800 buffer even for
    // in-range coordinates. This keeps the same axis flip but stays in
    // [0, width*height).
    int flatIdx = (width - 1 - x) + (height - 1 - y) * width;
    fragments[flatIdx].col[0] = 1.0f;
    fragments[flatIdx].col[1] = 0.0f;
    fragments[flatIdx].col[2] = 0.0f;
}
For a simple cube it takes about one second to render, which is far too slow.