I'm new to OpenCL. I wrote an OpenCL software rasterizer that rasterizes triangles. Right now it takes about 32 seconds to render a single cube, which is far too slow. I'm testing on an NVIDIA RTX 3080 Laptop GPU, so this result seems very wrong.
Here is the kernel,
__kernel void fragment_shader(__global struct Fragment* fragments,
                              __global struct Triangle_* triangles,
                              int triCount)
{
    size_t px = get_global_id(0); // one work-item per triangle
    //size_t py = get_global_id(1);

    int imageWidth = 256;
    int imageHeight = 256;

    if (px < triCount)
    {
        float3 v0Raster = (float3)(triangles[px].v[0].pos[0], triangles[px].v[0].pos[1], triangles[px].v[0].pos[2]);
        float3 v1Raster = (float3)(triangles[px].v[1].pos[0], triangles[px].v[1].pos[1], triangles[px].v[1].pos[2]);
        float3 v2Raster = (float3)(triangles[px].v[2].pos[0], triangles[px].v[2].pos[1], triangles[px].v[2].pos[2]);

        // Screen-space bounding box of the triangle
        float xmin = min3(v0Raster.x, v1Raster.x, v2Raster.x);
        float ymin = min3(v0Raster.y, v1Raster.y, v2Raster.y);
        float xmax = max3(v0Raster.x, v1Raster.x, v2Raster.x);
        float ymax = max3(v0Raster.y, v1Raster.y, v2Raster.y);

        float slope = (ymax - ymin) / (xmax - xmin); // currently unused

        // xmin/xmax/ymin/ymax can be negative, so clamp in signed ints;
        // casting a negative float straight to uint would wrap around.
        int x0 = max(0, (int)floor(xmin));
        int x1 = min(imageWidth - 1, (int)floor(xmax));
        int y0 = max(0, (int)floor(ymin));
        int y1 = min(imageHeight - 1, (int)floor(ymax));

        float3 v0 = v0Raster;
        float3 v1 = v1Raster;
        float3 v2 = v2Raster;
        float area = edgeFunction(v0Raster, v1Raster, v2Raster); // currently unused

        // Test every pixel of the bounding box against the three edges
        for (int y = y0; y <= y1; ++y) {
            for (int x = x0; x <= x1; ++x) {
                float3 p = (float3)(x + 0.5f, y + 0.5f, 0.0f);
                float w0 = edgeFunction(v1Raster, v2Raster, p);
                float w1 = edgeFunction(v2Raster, v0Raster, p);
                float w2 = edgeFunction(v0Raster, v1Raster, p);
                if (w0 >= 0 && w1 >= 0 && w2 >= 0) {
                    fragments[y * imageWidth + x].col[0] = 1.0f;
                    fragments[y * imageWidth + x].col[1] = 0;
                    fragments[y * imageWidth + x].col[2] = 0;
                }
            }
        }
    }
}
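For completeness, the structs and helper functions the kernel references look roughly like this. This is a simplified sketch; the exact field layouts in the real code may differ slightly.

// Sketch of the data layout and helpers used by the kernel (simplified).
struct Vertex_   { float pos[3]; };
struct Triangle_ { struct Vertex_ v[3]; };
struct Fragment  { float col[3]; };

inline float min3(float a, float b, float c) { return fmin(a, fmin(b, c)); }
inline float max3(float a, float b, float c) { return fmax(a, fmax(b, c)); }

// Standard 2D edge function: signed area of the parallelogram spanned by
// (b - a) and (c - a); its sign tells which side of edge a->b point c is on.
inline float edgeFunction(float3 a, float3 b, float3 c)
{
    return (c.x - a.x) * (b.y - a.y) - (c.y - a.y) * (b.x - a.x);
}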
The kernel is supposed to run once per triangle: it computes the triangle's screen-space bounding box and then rasterizes the pixels inside it. Here is how I invoke it:
global_size[0] = triCount; // one work-item per triangle
auto time_start = std::chrono::high_resolution_clock::now();
err = clEnqueueNDRangeKernel(commandQueue, kernel_fragmentShader, 1, NULL,
                             global_size, NULL, 0, NULL, NULL);
if (err < 0) {
    perror("Couldn't enqueue the kernel_fragmentShader");
    exit(1);
}
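To separate the kernel's own execution time from transfer and queue overhead, one option is to time the launch with an OpenCL profiling event. This is a minimal sketch and assumes commandQueue was created with CL_QUEUE_PROFILING_ENABLE:

// Sketch: time only the kernel via event profiling
// (assumes the queue was created with CL_QUEUE_PROFILING_ENABLE).
cl_event evt;
err = clEnqueueNDRangeKernel(commandQueue, kernel_fragmentShader, 1, NULL,
                             global_size, NULL, 0, NULL, &evt);
clWaitForEvents(1, &evt);

cl_ulong t_begin = 0, t_end = 0;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(t_begin), &t_begin, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(t_end), &t_end, NULL);
printf("kernel time: %.3f ms\n", (t_end - t_begin) * 1e-6);
clReleaseEvent(evt);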
Even after stripping out lighting and everything else, it still takes around 20 seconds to render a cube. Why is this kernel so slow?