Optimizing a raytracing shader in GLSL

Question

I have coded a voxelization based raytracer which is working as expected but is very slow.

Currently the raytracer code is as follows:

#version 430 
//normalized positon from (-1, -1) to (1, 1)
in vec2 f_coord;

out vec4 fragment_color;

struct Voxel
{
    vec4 position;
    vec4 normal;
    vec4 color;
};

struct Node
{
    //children of the current node
    int children[8];
};

layout(std430, binding = 0) buffer voxel_buffer
{
    //last layer of the tree, the leafs
    Voxel voxels[];
};
layout(std430, binding = 1) buffer buffer_index
{
    uint index;
};
layout(std430, binding = 2) buffer tree_buffer
{
    //tree structure       
    Node tree[];
};
layout(std430, binding = 3) buffer tree_index
{
    uint t_index;
};

uniform vec3 camera_pos; //position of the camera
uniform float aspect_ratio; // aspect ratio of the window
uniform float cube_dim; //Dimenions of the voxelization cube
uniform int voxel_resolution; //Side length of the cube in voxels

#define EPSILON 0.01
// Detect whether a position is inside of the voxel with size size located at corner
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    bool inside = true;
    position-=corner;//coordinate of the position relative to the box coordinate system
    //Test that all coordinates are inside the box, if any is outisde, the point is out the box
    for(int i=0; i<3; i++)
    {
        inside = inside && (position[i] > -EPSILON);
        inside = inside && (position[i] < size+EPSILON);
    }

    return inside;
}

//Get the distance to a box or infinity if the box cannot be hit
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
    dir = normalize(dir);
    vec3 corner1 = corner0 + vec3(size,size,size);//Oposite corner of the box

    float coeffs[6];
    //Calculate the intersaction coefficients with te 6 bonding planes 
    coeffs[0] = (corner0.x - origin.x)/(dir.x);
    coeffs[1] = (corner0.y - origin.y)/(dir.y);
    coeffs[2] = (corner0.z - origin.z)/(dir.z);

    coeffs[3] = (corner1.x - origin.x)/(dir.x);
    coeffs[4] = (corner1.y - origin.y)/(dir.y);
    coeffs[5] = (corner1.z - origin.z)/(dir.z);
    //by default the distance to the box is infinity
    float t = 1.f/0.f;

    for(uint i=0; i<6; i++){
        //if the distance to a boxis negative, we set it to infinity as we cannot travel in the negative direction
        coeffs[i] = coeffs[i] < 0 ? 1.f/0.f : coeffs[i];
        //The distance is the minumum of the previous calculated distance and the current distance
        t = inBoxBounds(corner0,size,origin+dir*coeffs[i]) ? min(coeffs[i],t) : t;
    }

    return t;
}

#define MAX_TREE_HEIGHT 11
int nodes[MAX_TREE_HEIGHT];
int levels[MAX_TREE_HEIGHT];
vec3 positions[MAX_TREE_HEIGHT];
int sp=0;

void push(int node, int level, vec3 corner)
{
    nodes[sp] = node;
    levels[sp] = level;
    positions[sp] = corner;
    sp++;
}

void main()
{   
    int count = 0; //count the iterations of the algorithm
    vec3 r = vec3(f_coord.x, f_coord.y, 1.f/tan(radians(40))); //direction of the ray
    r.y/=aspect_ratio; //modify the direction based on the windows aspect ratio
    vec3 dir = r;
    r += vec3(0,0,-1.f/tan(radians(40))) + camera_pos; //put the ray at the camera position

    fragment_color = vec4(0);
    int max_level = int(log2(voxel_resolution));//height of the tree
    push(0,0,vec3(-cube_dim));//set the stack
    float tc = 1.f; //initial color value, to be decreased whenever a voxel is hit
    //tree variables
    int level=0;
    int node=0;
    vec3 corner;

    do
    {
        //pop from stack
        sp--;
        node = nodes[sp];
        level = levels[sp];
        corner = positions[sp];

        //set the size of the current voxel 
        float size = cube_dim / pow(2,level);
        //set the corners of the children
        vec3 corners[] =
            {corner,                        corner+vec3(0,0,size),
            corner+vec3(0, size,0),         corner+vec3(0,size,size),
            corner+vec3(size,0,0),          corner+vec3(size,0,size),
            corner+vec3(size,size,0),       corner+vec3(size,size,size)};

        float coeffs[8];
        for(int child=0; child<8; child++)
        {
            //Test non zero childs, zero childs are empty and thus should be discarded
            coeffs[child] = tree[node].children[child]>0?
                //Get the distance to your child if it's not empty or infinity if it's empty
                boxIntersection(r, dir, corners[child], size) : 1.f/0.f;
        }
        int indices[8] = {0,1,2,3,4,5,6,7};
        //sort the children from closest to farthest
        for(uint i=0; i<8; i++)
        {
            for(uint j=i; j<8; j++)
            {
                if((coeffs[j] < coeffs[i]))
                {
                    float swap = coeffs[i];
                    coeffs[i] = coeffs[j];
                    coeffs[j] = swap;

                    int iSwap = indices[i];
                    indices[i] = indices[j];
                    indices[j] = iSwap;

                    vec3 vSwap = corners[i];
                    corners[i] = corners[j];
                    corners[j] = vSwap;
                }
            }
        }
        //push to stack
        for(uint i=7; i>=0; i--)
        {
            if(!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                    level+1, corners[i]);
            }
        }
        count++;
    }while(level < (max_level-1) && sp>0);
    //set color
    fragment_color = vec4(count)/100;
}

As it may not be fully clear what this does, let me explain.

We check ray-box intersections starting with a big cube. If we hit it we test intersection with the 8 cubes that compose it.

If we hit any fo those we check intersections with the 8 cubes that make up that cube.

In 2D this would look as follows:

In this case we have 4 layers, we check the big box first, then the ones colored in red, then the ones colored in green, and finally the ones colored in blue.

Printing out the number of times the raytracing step executed as a color (which is what the code snippet I have provided does)

results in the following image:

As you can see, most of the time the shader doesn't do more than 100 iterations.

However this shader takes 200 000 microseconds to execute on average in a gtx 1070.

Since the issue is not number of executions, my problem is likely to be on thread execution.

Does anyone know how I could optimize this code? The biggest botttleneck seems to be the use of a stack.

If I run the same code without pushing to the stack (generating wrong output), there is a 10 fold improvement in runtime

why not store your mesh in a 3D texture and directly use 3d texture coordinates? no need for any stack and stuff around the spatial subdivision see [How to best write a voxel engine in C with performance in mind](https://stackoverflow.com/a/48092685/2521214) where is my GLSL attempt for this made from this [Reflection and refraction impossible without recursive ray tracing?](https://stackoverflow.com/a/45140313/2521214) ray-tracer of mine. Also take a look at [What techniques were used to reduce the required re-rendering in 3D programs?](https://retrocomputing.stackexchange.com/a/5903/6868) — Spektre, Jun 10 '18 at 06:08
Already coded that one. Issue here is scalability of data and asymptotic complexity. If you use a 3D texture most of it is void (wasted space). So for a scene with 512 voxels, you end up allcoating 512^3*size_of_encoding. However the vast majority of it is not needed. Second, unless you cleverly create the mip_maps, the asymptotic complexity for the image is liner in the side length of the texture (e.g 512) however mine is amortized cost log(n) and it uses only as much data as needed. — Makogan, Jun 10 '18 at 06:57
@Spektre to give you an idea, my gpu could not allocate a 1024 3D texture, but can handle a 2048 oct tree And a 512 3D texture consummed about 2 gigabytes of VRAM, whereas this one can do away with only 200 MB — Makogan, Jun 10 '18 at 06:59
I suggest you try running this multi staged. Write the result after each iteration and then rerun the shader and skip pixels with a hit in the vertex stage. Do you currently just have one screen sized rectangle for vertices? You could use a point for every pixel and set the position for finished pixels behind the camera in subsequent passes. — Darklighter, Jun 12 '18 at 03:26
Yes i have a scren sized rectangle for ray tracing, I do not quite understand what you are describing however. — Makogan, Jun 12 '18 at 04:44
You have some commented out code and unused variables in there. If you cleanup the code and add more inline comments to describe what each step does, I can have look. — bernie, Jun 12 '18 at 13:04
@bernie I deleted the commented code, although I am not sure which unused variables you are refferring to — Makogan, Jun 12 '18 at 20:30
@Makogan normals is unused. I haven't done an exhaustive check for others after noticing that — bernie, Jun 12 '18 at 20:31
What is the desired result? Is it the nearest voxel hit or do you really just want to count how many octrees are tested? — bernie, Jun 12 '18 at 21:42
@bernie currently the intended result is just to see how manh iterations were needed to compute a voxel collission. The actual final use case is to sue this method to compute real time global illumination, — Makogan, Jun 12 '18 at 23:36
@Makogan I am not sure if it is relevant but you can implement path tracing without a call stack. I have very recently did the following: https://stackoverflow.com/a/62057711/7330813 — Kaan E., May 28 '20 at 05:41

score 2 · Answer 1 · answered Jun 12 '18 at 18:09

It seems you test for intersection with the ray most of all voxels in each level of the octree. And sort them (by some distance) also in each level. I propose another approach.

If the ray intersects with the bounding box (level 0 of the octree) it makes it at two faces of the box. Or in a corner or an edge, these are "corner" cases.

Finding the 3D ray-plane intersection can be done like here. Finding if the intersection is inside the face (quad) can be done by testing if the point is inside of one of the two triangles of the face, like here.

Get the farthest intersection I0 from the camera. Also let r be a unit vector of the ray in the direction I0 toward the camera.

Find the deepest voxel for I0 coordinates. This is the farthest voxel from the camera.

Now we want the exit-coordinates I0e for the ray in that voxel, through another face. While you could do again the calculations for all 6 faces, if your voxels are X,Y,X aligned and you define the ray in the same coordinates system as the octree, then calculae simplify a lot.

Apply a little displacement (e.g. a 1/1000 of the smallest voxel size) to I0e by the r unit vector of the ray: I1 = I0e + r/1000. Find the voxel to these I1. This is the next voxel in the sorted list of voxel-ray intersections.

Repeat finding I1e then I2 then I2e then I3 etc. until the bounding box is exited. The list of crossed voxels is sorted.

Working with the octree can be optimized depending on how you store its info: All possible nodes or just used. Nodes with data or just "pointers" to another container with the data. This is matter for another question.

This will be slower than the intersection check I am using – Makogan Jun 12 '18 at 21:04 — Makogan, Jun 12 '18 at 21:04

score 2 · Answer 2 · answered Jun 13 '18 at 19:01

The first thing that stands out is your box intersection function. Have a look at inigo quilez' procedural box function for a much faster version. Since your boxsize is uniform in all axes and you don't need outNormal, you can get an even lighter version. In essence, use maths instead of the brute force approach that tests each box plane.

Also, try to avoid temporary storage where possible. For example, the corners array could be computed on demand for each octree box. Of course, with the above suggestion, these will be changed to box centers.

Since nodes, levels and positions are always accessed together, try co-locating them in a new single struct and access them as a single unit.

Will look more later...

score 1 · Accepted Answer · answered Jun 18 '18 at 23:14

Thread execution on a GPU may be massively parallel, but that doesn’t mean that all threads run independently from one another. Groups of threads execute exactly the same instructions, the only difference is the input data. That means that branches and therefore loops can’t make a thread diverge in execution and therefore also not let them terminate early.

Your example shows the most extreme edge case of this: when there is a high likelyhood that in a group of threads all work that’s done is relevant to one thread only.

To alleviate this, you should try to reduce the difference in execution length (iterations in your case) for threads in a group (or in total). This can be done by setting a limit on the number of iterations per shader pass and rescheduling only those threads/pixels that need more iterations.

It’s not like GLSL is meant for this. As i suggested in my comment, you could e.g. have point vertices for every pixel in the image, and for subsequent passes move the points which are already finished in such a way that they are outside your clipping planes. The fragments are hopefully scheduled properly then. — Darklighter, Jun 18 '18 at 23:48
@Makogan can you let me know after you tried it (if you try it)? — Darklighter, Jun 19 '18 at 22:53

Optimizing a raytracing shader in GLSL

3 Answers3