I have coded a voxelization based raytracer which is working as expected but is very slow.
Currently the raytracer code is as follows:
#version 430
//normalized positon from (-1, -1) to (1, 1)
in vec2 f_coord;
out vec4 fragment_color;
struct Voxel
{
vec4 position;
vec4 normal;
vec4 color;
};
struct Node
{
//children of the current node
int children[8];
};
layout(std430, binding = 0) buffer voxel_buffer
{
//last layer of the tree, the leafs
Voxel voxels[];
};
layout(std430, binding = 1) buffer buffer_index
{
uint index;
};
layout(std430, binding = 2) buffer tree_buffer
{
//tree structure
Node tree[];
};
layout(std430, binding = 3) buffer tree_index
{
uint t_index;
};
uniform vec3 camera_pos; //position of the camera
uniform float aspect_ratio; // aspect ratio of the window
uniform float cube_dim; //Dimenions of the voxelization cube
uniform int voxel_resolution; //Side length of the cube in voxels
#define EPSILON 0.01
// Detect whether a position is inside of the voxel with size size located at corner
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
bool inside = true;
position-=corner;//coordinate of the position relative to the box coordinate system
//Test that all coordinates are inside the box, if any is outisde, the point is out the box
for(int i=0; i<3; i++)
{
inside = inside && (position[i] > -EPSILON);
inside = inside && (position[i] < size+EPSILON);
}
return inside;
}
//Get the distance to a box or infinity if the box cannot be hit
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
dir = normalize(dir);
vec3 corner1 = corner0 + vec3(size,size,size);//Oposite corner of the box
float coeffs[6];
//Calculate the intersaction coefficients with te 6 bonding planes
coeffs[0] = (corner0.x - origin.x)/(dir.x);
coeffs[1] = (corner0.y - origin.y)/(dir.y);
coeffs[2] = (corner0.z - origin.z)/(dir.z);
coeffs[3] = (corner1.x - origin.x)/(dir.x);
coeffs[4] = (corner1.y - origin.y)/(dir.y);
coeffs[5] = (corner1.z - origin.z)/(dir.z);
//by default the distance to the box is infinity
float t = 1.f/0.f;
for(uint i=0; i<6; i++){
//if the distance to a boxis negative, we set it to infinity as we cannot travel in the negative direction
coeffs[i] = coeffs[i] < 0 ? 1.f/0.f : coeffs[i];
//The distance is the minumum of the previous calculated distance and the current distance
t = inBoxBounds(corner0,size,origin+dir*coeffs[i]) ? min(coeffs[i],t) : t;
}
return t;
}
#define MAX_TREE_HEIGHT 11
int nodes[MAX_TREE_HEIGHT];
int levels[MAX_TREE_HEIGHT];
vec3 positions[MAX_TREE_HEIGHT];
int sp=0;
void push(int node, int level, vec3 corner)
{
nodes[sp] = node;
levels[sp] = level;
positions[sp] = corner;
sp++;
}
void main()
{
int count = 0; //count the iterations of the algorithm
vec3 r = vec3(f_coord.x, f_coord.y, 1.f/tan(radians(40))); //direction of the ray
r.y/=aspect_ratio; //modify the direction based on the windows aspect ratio
vec3 dir = r;
r += vec3(0,0,-1.f/tan(radians(40))) + camera_pos; //put the ray at the camera position
fragment_color = vec4(0);
int max_level = int(log2(voxel_resolution));//height of the tree
push(0,0,vec3(-cube_dim));//set the stack
float tc = 1.f; //initial color value, to be decreased whenever a voxel is hit
//tree variables
int level=0;
int node=0;
vec3 corner;
do
{
//pop from stack
sp--;
node = nodes[sp];
level = levels[sp];
corner = positions[sp];
//set the size of the current voxel
float size = cube_dim / pow(2,level);
//set the corners of the children
vec3 corners[] =
{corner, corner+vec3(0,0,size),
corner+vec3(0, size,0), corner+vec3(0,size,size),
corner+vec3(size,0,0), corner+vec3(size,0,size),
corner+vec3(size,size,0), corner+vec3(size,size,size)};
float coeffs[8];
for(int child=0; child<8; child++)
{
//Test non zero childs, zero childs are empty and thus should be discarded
coeffs[child] = tree[node].children[child]>0?
//Get the distance to your child if it's not empty or infinity if it's empty
boxIntersection(r, dir, corners[child], size) : 1.f/0.f;
}
int indices[8] = {0,1,2,3,4,5,6,7};
//sort the children from closest to farthest
for(uint i=0; i<8; i++)
{
for(uint j=i; j<8; j++)
{
if((coeffs[j] < coeffs[i]))
{
float swap = coeffs[i];
coeffs[i] = coeffs[j];
coeffs[j] = swap;
int iSwap = indices[i];
indices[i] = indices[j];
indices[j] = iSwap;
vec3 vSwap = corners[i];
corners[i] = corners[j];
corners[j] = vSwap;
}
}
}
//push to stack
for(uint i=7; i>=0; i--)
{
if(!isinf(coeffs[i]))
{
push(tree[node].children[indices[i]],
level+1, corners[i]);
}
}
count++;
}while(level < (max_level-1) && sp>0);
//set color
fragment_color = vec4(count)/100;
}
As it may not be fully clear what this does, let me explain.
We check ray-box intersections starting with a big cube. If we hit it we test intersection with the 8 cubes that compose it.
If we hit any fo those we check intersections with the 8 cubes that make up that cube.
In 2D this would look as follows:
In this case we have 4 layers, we check the big box first, then the ones colored in red, then the ones colored in green, and finally the ones colored in blue.
Printing out the number of times the raytracing step executed as a color (which is what the code snippet I have provided does)
results in the following image:
As you can see, most of the time the shader doesn't do more than 100 iterations.
However this shader takes 200 000 microseconds to execute on average in a gtx 1070.
Since the issue is not number of executions, my problem is likely to be on thread execution.
Does anyone know how I could optimize this code? The biggest botttleneck seems to be the use of a stack.
If I run the same code without pushing to the stack (generating wrong output), there is a 10 fold improvement in runtime