
I need help optimizing the following set of shaders:

Vertex:

precision mediump float;

uniform vec2 rubyTextureSize;

attribute vec4 vPosition;
attribute vec2 a_TexCoordinate;

varying vec2 tc;

void main() {
    gl_Position = vPosition;

    tc = a_TexCoordinate;
}

Fragment:

precision mediump float;

/*
 Uniforms
 - rubyTexture: texture sampler
 - rubyTextureSize: size of the texture before rendering
 - rubyTextureFract: size of a single texel in texture coordinates
 */

uniform sampler2D rubyTexture;
uniform vec2 rubyTextureSize;
uniform vec2 rubyTextureFract;

/*
 Varying attributes
 - tc: coordinate of the texel being processed
 Note: the packed xyp_[]_[]_[] coordinates (3 texture positions each) are
 computed as locals in main() in this version, not passed as varyings.
 */

varying vec2 tc;

/*
 Constants
 */
/*
 Inequation coefficients for interpolation
 Equations are in the form: Ay + Bx = C
 45, 30, and 60 denote the angle from the x-axis of the lines that each coefficient set describes
 */
const vec4 Ai = vec4(1.0, -1.0, -1.0, 1.0);
const vec4 B45 = vec4(1.0, 1.0, -1.0, -1.0);
const vec4 C45 = vec4(1.5, 0.5, -0.5, 0.5);
const vec4 B30 = vec4(0.5, 2.0, -0.5, -2.0);
const vec4 C30 = vec4(1.0, 1.0, -0.5, 0.0);
const vec4 B60 = vec4(2.0, 0.5, -2.0, -0.5);
const vec4 C60 = vec4(2.0, 0.0, -1.0, 0.5);

const vec4 M45 = vec4(0.4, 0.4, 0.4, 0.4);
const vec4 M30 = vec4(0.2, 0.4, 0.2, 0.4);
const vec4 M60 = M30.yxwz;
const vec4 Mshift = vec4(0.2);

// Coefficient for weighted edge detection
const float coef = 2.0;
// Threshold for if luminance values are "equal"
const vec4 threshold = vec4(0.32);

// Conversion from RGB to Luminance (from GIMP)
const vec3 lum = vec3(0.21, 0.72, 0.07);

// Performs same logic operation as && for vectors
bvec4 _and_(bvec4 A, bvec4 B) {
    return bvec4(A.x && B.x, A.y && B.y, A.z && B.z, A.w && B.w);
}

// Performs same logic operation as || for vectors
bvec4 _or_(bvec4 A, bvec4 B) {
    return bvec4(A.x || B.x, A.y || B.y, A.z || B.z, A.w || B.w);
}

// Converts 4 3-color vectors into 1 4-value luminance vector
vec4 lum_to(vec3 v0, vec3 v1, vec3 v2, vec3 v3) {
    //    return vec4(dot(lum, v0), dot(lum, v1), dot(lum, v2), dot(lum, v3));

    return mat4(v0.x, v1.x, v2.x, v3.x, v0.y, v1.y, v2.y, v3.y, v0.z, v1.z,
            v2.z, v3.z, 0.0, 0.0, 0.0, 0.0) * vec4(lum, 0.0);
}

// Gets the difference between 2 4-value luminance vectors
vec4 lum_df(vec4 A, vec4 B) {
    return abs(A - B);
}

// Determines if 2 4-value luminance vectors are "equal" based on threshold
bvec4 lum_eq(vec4 A, vec4 B) {
    return lessThan(lum_df(A, B), threshold);
}

// Weighted sum of luminance differences around a corner, used for edge detection
vec4 lum_wd(vec4 a, vec4 b, vec4 c, vec4 d, vec4 e, vec4 f, vec4 g, vec4 h) {
    return lum_df(a, b) + lum_df(a, c) + lum_df(d, e) + lum_df(d, f)
            + 4.0 * lum_df(g, h);
}

// Gets the difference between 2 3-value rgb colors
float c_df(vec3 c1, vec3 c2) {
    vec3 df = abs(c1 - c2);
    return df.r + df.g + df.b;
}

void main() {

    /*
 Mask for algorithm
     +-----+-----+-----+-----+-----+
     |     |  1  |  2  |  3  |     |
     +-----+-----+-----+-----+-----+
     |  5  |  6  |  7  |  8  |  9  |
     +-----+-----+-----+-----+-----+
     | 10  | 11  | 12  | 13  | 14  |
     +-----+-----+-----+-----+-----+
     | 15  | 16  | 17  | 18  | 19  |
     +-----+-----+-----+-----+-----+
     |     | 21  | 22  | 23  |     |
     +-----+-----+-----+-----+-----+
     */

    float x = rubyTextureFract.x;
    float y = rubyTextureFract.y;

    vec4 xyp_1_2_3 = tc.xxxy + vec4(-x, 0.0, x, -2.0 * y);
    vec4 xyp_6_7_8 = tc.xxxy + vec4(-x, 0.0, x, -y);
    vec4 xyp_11_12_13 = tc.xxxy + vec4(-x, 0.0, x, 0.0);
    vec4 xyp_16_17_18 = tc.xxxy + vec4(-x, 0.0, x, y);
    vec4 xyp_21_22_23 = tc.xxxy + vec4(-x, 0.0, x, 2.0 * y);
    vec4 xyp_5_10_15 = tc.xyyy + vec4(-2.0 * x, -y, 0.0, y);
    vec4 xyp_9_14_19 = tc.xyyy + vec4(2.0 * x, -y, 0.0, y);

    // Get mask values by performing texture lookup with the uniform sampler
    vec3 P1 = texture2D(rubyTexture, xyp_1_2_3.xw).rgb;
    vec3 P2 = texture2D(rubyTexture, xyp_1_2_3.yw).rgb;
    vec3 P3 = texture2D(rubyTexture, xyp_1_2_3.zw).rgb;

    vec3 P6 = texture2D(rubyTexture, xyp_6_7_8.xw).rgb;
    vec3 P7 = texture2D(rubyTexture, xyp_6_7_8.yw).rgb;
    vec3 P8 = texture2D(rubyTexture, xyp_6_7_8.zw).rgb;

    vec3 P11 = texture2D(rubyTexture, xyp_11_12_13.xw).rgb;
    vec3 P12 = texture2D(rubyTexture, xyp_11_12_13.yw).rgb;
    vec3 P13 = texture2D(rubyTexture, xyp_11_12_13.zw).rgb;

    vec3 P16 = texture2D(rubyTexture, xyp_16_17_18.xw).rgb;
    vec3 P17 = texture2D(rubyTexture, xyp_16_17_18.yw).rgb;
    vec3 P18 = texture2D(rubyTexture, xyp_16_17_18.zw).rgb;

    vec3 P21 = texture2D(rubyTexture, xyp_21_22_23.xw).rgb;
    vec3 P22 = texture2D(rubyTexture, xyp_21_22_23.yw).rgb;
    vec3 P23 = texture2D(rubyTexture, xyp_21_22_23.zw).rgb;

    vec3 P5 = texture2D(rubyTexture, xyp_5_10_15.xy).rgb;
    vec3 P10 = texture2D(rubyTexture, xyp_5_10_15.xz).rgb;
    vec3 P15 = texture2D(rubyTexture, xyp_5_10_15.xw).rgb;

    vec3 P9 = texture2D(rubyTexture, xyp_9_14_19.xy).rgb;
    vec3 P14 = texture2D(rubyTexture, xyp_9_14_19.xz).rgb;
    vec3 P19 = texture2D(rubyTexture, xyp_9_14_19.xw).rgb;

    // Store luminance values of each point in groups of 4
    // so that we may operate on all four corners at once
    vec4 p7 = lum_to(P7, P11, P17, P13);
    vec4 p8 = lum_to(P8, P6, P16, P18);
    vec4 p11 = p7.yzwx; // P11, P17, P13, P7
    vec4 p12 = lum_to(P12, P12, P12, P12);
    vec4 p13 = p7.wxyz; // P13, P7,  P11, P17
    vec4 p14 = lum_to(P14, P2, P10, P22);
    vec4 p16 = p8.zwxy; // P16, P18, P8,  P6
    vec4 p17 = p7.zwxy; // P17, P13, P7,  P11
    vec4 p18 = p8.wxyz; // P18, P8,  P6,  P16
    vec4 p19 = lum_to(P19, P3, P5, P21);
    vec4 p22 = p14.wxyz; // P22, P14, P2,  P10
    vec4 p23 = lum_to(P23, P9, P1, P15);

    // Get the fragment's fractional position within the current texel, in [0..1)
    vec2 fp = fract(tc * rubyTextureSize);

    // Determine amount of "smoothing" or mixing that could be done on texel corners
    vec4 AiMulFpy = Ai * fp.y;
    vec4 B45MulFpx = B45 * fp.x;
    vec4 ma45 = smoothstep(C45 - M45, C45 + M45, AiMulFpy + B45MulFpx);
    vec4 ma30 = smoothstep(C30 - M30, C30 + M30, AiMulFpy + B30 * fp.x);
    vec4 ma60 = smoothstep(C60 - M60, C60 + M60, AiMulFpy + B60 * fp.x);
    vec4 marn = smoothstep(C45 - M45 + Mshift, C45 + M45 + Mshift,
            AiMulFpy + B45MulFpx);

    // Perform edge weight calculations
    vec4 e45 = lum_wd(p12, p8, p16, p18, p22, p14, p17, p13);
    vec4 econt = lum_wd(p17, p11, p23, p13, p7, p19, p12, p18);
    vec4 e30 = lum_df(p13, p16);
    vec4 e60 = lum_df(p8, p17);

    // Calculate rule results for interpolation
    bvec4 r45_1 = _and_(notEqual(p12, p13), notEqual(p12, p17));
    bvec4 r45_2 = _and_(not (lum_eq(p13, p7)), not (lum_eq(p13, p8)));
    bvec4 r45_3 = _and_(not (lum_eq(p17, p11)), not (lum_eq(p17, p16)));
    bvec4 r45_4_1 = _and_(not (lum_eq(p13, p14)), not (lum_eq(p13, p19)));
    bvec4 r45_4_2 = _and_(not (lum_eq(p17, p22)), not (lum_eq(p17, p23)));
    bvec4 r45_4 = _and_(lum_eq(p12, p18), _or_(r45_4_1, r45_4_2));
    bvec4 r45_5 = _or_(lum_eq(p12, p16), lum_eq(p12, p8));
    bvec4 r45 = _and_(r45_1, _or_(_or_(_or_(r45_2, r45_3), r45_4), r45_5));
    bvec4 r30 = _and_(notEqual(p12, p16), notEqual(p11, p16));
    bvec4 r60 = _and_(notEqual(p12, p8), notEqual(p7, p8));

    // Combine rules with edge weights
    bvec4 edr45 = _and_(lessThan(e45, econt), r45);
    bvec4 edrrn = lessThanEqual(e45, econt);
    bvec4 edr30 = _and_(lessThanEqual(coef * e30, e60), r30);
    bvec4 edr60 = _and_(lessThanEqual(coef * e60, e30), r60);

    // Finalize interpolation rules and cast to float (0.0 for false, 1.0 for true)
    vec4 final45 = vec4(_and_(_and_(not (edr30), not (edr60)), edr45));
    vec4 final30 = vec4(_and_(_and_(edr45, not (edr60)), edr30));
    vec4 final60 = vec4(_and_(_and_(edr45, not (edr30)), edr60));
    vec4 final36 = vec4(_and_(_and_(edr60, edr30), edr45));
    vec4 finalrn = vec4(_and_(not (edr45), edrrn));

    // Determine the color to mix with for each corner
    vec4 px = step(lum_df(p12, p17), lum_df(p12, p13));

    // Determine the mix amounts by combining the final rule result and corresponding
    // mix amount for the rule in each corner
    vec4 mac = final36 * max(ma30, ma60) + final30 * ma30 + final60 * ma60
            + final45 * ma45 + finalrn * marn;

    /*
     Calculate the resulting color by traversing clockwise and counter-clockwise around
     the corners of the texel

     Finally choose the result that has the largest difference from the texel's original
     color
     */
    vec3 res1 = P12;
    res1 = mix(res1, mix(P13, P17, px.x), mac.x);
    res1 = mix(res1, mix(P7, P13, px.y), mac.y);
    res1 = mix(res1, mix(P11, P7, px.z), mac.z);
    res1 = mix(res1, mix(P17, P11, px.w), mac.w);

    vec3 res2 = P12;
    res2 = mix(res2, mix(P17, P11, px.w), mac.w);
    res2 = mix(res2, mix(P11, P7, px.z), mac.z);
    res2 = mix(res2, mix(P7, P13, px.y), mac.y);
    res2 = mix(res2, mix(P13, P17, px.x), mac.x);

    gl_FragColor = vec4(mix(res1, res2, step(c_df(P12, res1), c_df(P12, res2))),
            1.0);
}

The shaders receive a 2D texture and are meant to scale it beautifully across a high-res 2D surface (the device screen). It is an optimization of the SABR scaling algorithm in case it matters.

It already works, and performs OK on very high-end devices (like LG Nexus 4), but it is really slow on weaker devices.

The devices that really matter to me are the Samsung Galaxy S2 / S3, with the Mali-400MP GPU - and they perform horribly with this shader.

So far I've tried:

  1. Eliminating varyings (advice from ARM's Mali guide) - gave a minor improvement.
  2. Overriding the mix() functions with my own - did no good.
  3. Reducing float precision to lowp - didn't change anything.

I measure performance by calculating render time (before and after eglSwapBuffers) - this gives me a very linear and consistent measurement of performance.

Beyond that, I don't really know where to look or what can be optimized here...

I know that this is a heavy algorithm, and I am not asking for advice on what alternate scaling methods to use - I've tried many and this algorithm gives the best visual result. I wish to use the exact same algorithm in an optimized way.

UPDATE 1

  1. I found that if I do all the texture fetches with a constant vector instead of dependent vectors I get a major performance improvement, so this is obviously a big bottleneck - probably because of the cache. However, I still need to do those fetches. I played with doing at least some of the fetches with vec2 varyings (without any swizzling) but it didn't improve anything. I wonder what might be a good way to efficiently poll 21 texels.

  2. I found that a major part of the calculations is being done multiple times with the exact same set of texels - because the output is scaled by at least 2x, and I sample with GL_NEAREST. There are at least 4 fragments that fall on exactly the same texels. If the scaling is 4x on a high-res device, there are 16 fragments that fall on the same texels - which is a big waste. Is there any way to perform an additional shader pass that will calculate all the values that don't change across multiple fragments? I thought about rendering to an additional off-screen texture, but I need to store multiple values per texel, not just one. (One packing idea is sketched below.)
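
One possible direction (a sketch under assumptions, not something from the thread): render a pre-pass at the input texture's resolution into an off-screen RGBA texture, packing four per-texel scalars into the four channels; more than four values would need extra targets or passes, and a plain RGBA8 texture limits each value to 8 bits. A minimal sketch of such a pre-pass fragment shader, reusing the question's uniforms to pack the luminance of a texel and three of its neighbours:

precision mediump float;

uniform sampler2D rubyTexture;
uniform vec2 rubyTextureFract; // size of one texel in texture coordinates

varying vec2 tc;

const vec3 lum = vec3(0.21, 0.72, 0.07);

void main() {
    // These values are identical for every output fragment that lands on
    // this input texel, so computing them once here avoids recomputation.
    float l0 = dot(lum, texture2D(rubyTexture, tc).rgb);
    float l1 = dot(lum, texture2D(rubyTexture, tc + vec2(rubyTextureFract.x, 0.0)).rgb);
    float l2 = dot(lum, texture2D(rubyTexture, tc + vec2(0.0, rubyTextureFract.y)).rgb);
    float l3 = dot(lum, texture2D(rubyTexture, tc + rubyTextureFract).rgb);
    gl_FragColor = vec4(l0, l1, l2, l3);
}

The main pass would then fetch this texture with GL_NEAREST to recover four precomputed values per texel.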

UPDATE 2

  1. Tried to simplify the boolean expressions using known boolean rules - saved me a few operations but didn't have any effect on performance. (An arithmetic reformulation is sketched below.)
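
For reference, the bvec4 helpers can also be expressed as float arithmetic, which some compilers vectorize better (a sketch meant to drop into the existing fragment shader; lum_eq_f is a hypothetical name, and step tests <= where lessThan tests <, which should not matter at these thresholds):

// 1.0 per component where two luminance vectors are "equal", else 0.0.
// AND then becomes *, OR becomes max(), and NOT becomes (1.0 - x).
vec4 lum_eq_f(vec4 A, vec4 B) {
    return step(abs(A - B), threshold);
}

// e.g. r45_2 = _and_(not(lum_eq(p13, p7)), not(lum_eq(p13, p8))) becomes:
// vec4 r45_2 = (1.0 - lum_eq_f(p13, p7)) * (1.0 - lum_eq_f(p13, p8));

Whether this beats the bvec version depends entirely on what the Mali compiler emits for each.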

UPDATE 3

  1. Thought about a way to pass calculations to the vertex shader - just have a "geometry" that creates my full screen, but with a lot of vertices that correspond to each original pixel before scaling. For example, if my original texture is 320x200 and my target screen is 1280x800, there will be 320x200 vertices spread evenly. Then, do most of the calculations in those vertices. The problem is that my target devices (S2 / S3) don't support vertex texture sampling.

UPDATE 4

  1. Measured performance on the LG Nexus 4 vs. the Samsung Galaxy S3 shows that the Nexus 4 runs it more than 10 times faster. How can this be? These are 2 devices from the same generation, with the same resolution, etc... Could the Mali-400MP really be that bad in certain situations? I'm sure there is something very specific that makes it run so slowly compared to the Nexus 4 (but I haven't found what yet).
  • I don't have time to read the whole program right now, but do you think you can do several passes and mix between passes? I don't think it would speed anything up, but at least it may free some shader cores... The problem with removing varyings is that it makes texture lookups harder to predict (although it gave you a speedup) – Trax May 12 '13 at 18:11
  • Do you think you can separate your filter? It looks like you are doing a convolution there; if the kernel can be separated into vertical and horizontal passes you don't need to do 21 lookups but 10. – Trax May 12 '13 at 18:28
  • Hi Trax, could you please elaborate on separating the filter? How would you sample the needed 21 texels with 10 lookups? – SirKnigget May 12 '13 at 18:32
  • Regarding several passes - I thought about it (in the update), but there are multiple intermediate values before the final color output - how can multiple passes be done here? – SirKnigget May 12 '13 at 18:33
  • By separable filter I mean breaking it up: first compute the horizontal neighbours -> output, then use that as input for the vertical pass. But as you pointed out, there are many interdependent products. – Trax May 12 '13 at 18:49
  • On the question of multiple lookups hitting the same texel ... worrying about that is trying to optimise what is already a best-case. The duplicate taps will be texture cache hits and will already be fast. It is the long trip to memory to fill the texture cache and potentially a bit more work for decompression & format conversions that take the time in texture cache misses. – ahcox May 20 '13 at 19:06
  • "It is an optimization of the SABR scaling algorithm in case it matters" Do you have a reference for the algorithm in question? – ahcox May 20 '13 at 19:36
  • This link gives the general idea: board.byuu.org/viewtopic.php?f=10&t=2248, but it's not the exact implementation I use here. – SirKnigget May 20 '13 at 23:45

3 Answers

In my experience, mobile GPU performance is roughly proportional to the number of texture2D calls. You have 21, which really is a lot. Generally, memory lookups are on the order of hundreds of times slower than calculations, so you can do a lot of calculation and still be bottlenecked on the texture lookups. (This also means optimising the rest of your code will probably have little effect: instead of being busy while it waits for the texture lookups, it will be idle while it waits for the texture lookups.) So you need to reduce the number of texture2D calls you make.

It's difficult to say how to do this since I don't really understand your shader, but some ideas:

  • separate it into a horizontal pass then a vertical pass. This only works for some shaders, e.g. blurs, but it can seriously reduce the number of texture lookups. For example a 5x5 gaussian blur naively does 25 texture lookups; if done horizontally then vertically, it only uses 10.
  • use linear filtering to 'cheat'. If you sample exactly between 4 pixels instead of the middle of 1 pixel with linear filtering enabled, you get the average of all 4 pixels for free. However, I don't know how it affects your shader. In the blur example again, using linear filtering to sample two pixels at once on either side of the middle pixel allows you to sample 5 pixels with 3 texture2D calls, reducing the 5x5 blur to just 6 samples for both horizontal and vertical (see the sketch after this list).
  • just use a smaller kernel (so you don't take so many samples). This affects the quality, so you'd probably want some way to detect the device performance and switch to a lower quality shader when the device appears to be slow.
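
To make the second point concrete, here is a minimal sketch of the horizontal pass of a 5-tap (1 4 6 4 1)/16 gaussian done with 3 fetches; the sampler must use GL_LINEAR, and texelSize is an assumed uniform holding 1.0 / texture dimensions. The offset of 1.2 texels makes one linear fetch blend taps 1 and 2 in the correct 4:1 ratio:

precision mediump float;

uniform sampler2D tex;  // must be sampled with GL_LINEAR
uniform vec2 texelSize; // 1.0 / texture dimensions

varying vec2 tc;

const float w0 = 0.375;  // 6/16: centre tap
const float w1 = 0.3125; // (4+1)/16: taps 1 and 2 combined
const float o1 = 1.2;    // 1 + 1/5: splits w1 between the two texels as 4:1

void main() {
    vec3 c = texture2D(tex, tc).rgb * w0;
    c += texture2D(tex, tc + vec2(texelSize.x * o1, 0.0)).rgb * w1;
    c += texture2D(tex, tc - vec2(texelSize.x * o1, 0.0)).rgb * w1;
    gl_FragColor = vec4(c, 1.0);
}

The vertical pass is the same with the offset on y. How much of this transfers to SABR's 21-texel mask is unclear, since its taps feed non-separable rules rather than a plain weighted sum.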
AshleysBrain
  • Thanks - I understand the suggestions but I don't know how to apply them to this specific shader. Using 2 passes, or linear filtering, doesn't seem to produce the intermediate values that are required here. I already have a low-quality solution as a fallback. – SirKnigget May 18 '13 at 14:29
  • I thought of an unorthodox way of eliminating my texture2D calls - draw a grid of vertices instead of just one quad, and pass to each one of them attributes containing all the needed texels. That would eliminate all texture sampling, but create a large drawing buffer every frame. Before I attempt to implement this lengthy solution, any advice? – SirKnigget May 18 '13 at 19:18
  • Definitely avoid the grid of vertices. You'd be replacing work done in fixed-function hardware on the GPU with work on the CPU _at_the_same_frequency_ ... a major loss. To be clear, in general, pulling work up the pipeline to the CPU can be a win in a case like gross culling, where a little work on the CPU avoids a lot on the GPU, but that doesn't apply here. – ahcox May 20 '13 at 18:34
  • @ahcox - What about the fact that: 1. My CPU is at 0% during this execution and the GPU is a big bottleneck, and 2. Everything this fragment shader does will be done 4x-16x fewer times than now (depending on the scaling factor). – SirKnigget May 20 '13 at 23:48
  • @SirKnigget That is starting to look like a completely different algorithm. Currently this does a gather per output pixel. It sounds as though you are intending to do a scatter per input pixel from the vertex shader. I don't even see an efficient algorithm for that inversion of the problem that can be expressed in terms of vertex and fragment programs. – ahcox May 21 '13 at 11:11
  • It will be the same algorithm. The gather per output pixel does the offsetting according to the original texture pixel size, and the filtering is GL_NEAREST. So effectively, a bunch of different fragments fall on the same original texels. Some of the calculations will still be relevant to each fragment though, and they'll be done in the fragment shader. – SirKnigget May 21 '13 at 12:05

There are a couple of Mali-400 oddities that you might need to be aware of:

  • You should really be using varyings without any swizzling for your texture lookups (i.e. do the calculation of "xyp_1_2_3.xw" etc. in the vertex shader, and use one varying per texture look-up instead of swizzling them; sketched after this list).
  • At some specific number of instructions (unfortunately NDAs prevent me from revealing this number), the performance drops off quite badly. You can get the instruction count from the offline compiler. To remedy this, you might be able to split your shader into multiple smaller ones, and use the undocumented GL_ARM_framebuffer_read extension to read the result of the previous one. (Google can tell you how to use it, it seems. Grepping a bit in the binaries of the offline shader compiler might also help.)
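
A sketch of the first point, using the question's uniforms (the tc_p* names are made up, and only three of the lookups are shown - the varying budget on Mali-400 will not fit all 21 coordinates this way):

// Vertex shader: compute per-lookup coordinates here, so each fragment
// shader texture2D call takes an unswizzled varying directly.
uniform vec2 rubyTextureFract;

attribute vec4 vPosition;
attribute vec2 a_TexCoordinate;

varying vec2 tc_p6;
varying vec2 tc_p7;
varying vec2 tc_p8;
// ... one vec2 varying per lookup, as far as the budget allows

void main() {
    gl_Position = vPosition;
    float x = rubyTextureFract.x;
    float y = rubyTextureFract.y;
    tc_p6 = a_TexCoordinate + vec2(-x, -y);
    tc_p7 = a_TexCoordinate + vec2(0.0, -y);
    tc_p8 = a_TexCoordinate + vec2(x, -y);
}

The fragment shader then calls texture2D(rubyTexture, tc_p7) directly, which the hardware can treat as a non-dependent texture read.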
kusma
  • I tried using multiple vec2 varyings for the lookups (as many varyings as possible) and did the rest of the lookups the normal way. It just got slower... – SirKnigget May 19 '13 at 11:06
  • The vec2-stuff is more about precision than about performance, really. Otherwise, the varying has to go through a register, which is fp16. – kusma May 19 '13 at 11:53
  • How do you suggest to apply what you said? I checked for max varyings on that GPU, used all of them (as vec2) for passing texture coordinates and sampled with them, and sampled the rest of the texels with the normal offset calculations. That didn't help, but maybe I did something wrong. – SirKnigget May 19 '13 at 12:15
  • The latter of my two points is probably the more relevant one to your question. The first point is (as I said) about precision, not about performance. In some cases it can also help for performance, but it depends on your vertex-count etc. – kusma May 19 '13 at 12:26
  • Thanks, I'll look into that. Is the extension you mentioned applicable to most GPUs, or just Mali series? – SirKnigget May 19 '13 at 20:21
  • Googled it a bit - isn't the extension you mentioned just a way to use an FBO in older API versions? – SirKnigget May 19 '13 at 20:31
  • No, the extension is a vendor-specific, undocumented extension to read the previous content of the current fragment. It does only apply to the Mali-series, but some other vendors have similar extensions. – kusma May 20 '13 at 13:02
  • I did not find any documentation via Google. Can you link? – SirKnigget May 20 '13 at 15:49
  • I fail to see what I am supposed to do with this. Is that considered "documentation"? – SirKnigget May 20 '13 at 23:56
  • No, I do not consider that to be documented, which is why I said "undocumented" all along. I said you can find out how to *use* it through Google, and I was referring to the link above which *does* show a shader that uses it. – kusma May 21 '13 at 09:50

The upper bound on your fragment shader performance (lower bound on execution time) is set by 21 texture loads and one write to the framebuffer (gl_FragColor =). It would be worthwhile to construct a fragment shader that simply executes 21 loads, accumulating the result of each load into a single vec4 and then writing that out. If you run this shader on your troublesome target hardware, you will know the delta between where your more complex shader is and its maximum potential performance on those particular GPU / driver / platform revisions. Your real shader can only be slower than that, so if this simple test shader is itself too slow, you will have to look further afield for a solution.
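
A minimal sketch of such a baseline shader, using the question's uniforms (the accumulate-and-write exists only to stop the compiler from eliminating the loads; the taps cover the same 5x5-minus-corners mask as the real shader):

precision mediump float;

uniform sampler2D rubyTexture;
uniform vec2 rubyTextureFract;

varying vec2 tc;

void main() {
    vec4 acc = vec4(0.0);
    for (int j = -2; j <= 2; j++) {
        for (int i = -2; i <= 2; i++) {
            // The mask is 5x5 minus the four corners: 21 texels.
            if (i * i + j * j < 8) {
                acc += texture2D(rubyTexture,
                        tc + vec2(float(i), float(j)) * rubyTextureFract);
            }
        }
    }
    gl_FragColor = acc / 21.0;
}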

Once that baseline is established, I have only vague advice for improving the shader that actually interests you but maybe my reasoning is of interest. I see that your code has all the texture loads bunched together at the top. At the hardware level, texture loads are of extremely long latency but GPU shader processors are able to do other things while they are in flight, including running other threads in the same block of work. This means broadly that a final shader binary which has lots of arithmetic work spread between the loads will do the arithmetic work for free in the shadow of the loads and also that a shader program using few registers will allow lots of threads to be run at the same time, each thread potentially doing its arithmetic work while other threads are blocked loading texels. Hopefully, any shader compiler will move your code around to achieve the required interleaving. However, it can't hurt to give it a hand, and thus:

  • Try to move each arithmetic statement up (lexically) in the file as high as it can go without breaking a dependency. This could help spread your loads out if the compiler has missed a trick (see the sketch after this list).
  • Try to use all intermediate results as soon as possible so the compiler can recognise that their variables are dead and thus free up registers. This might reduce register usage and so increase your program's occupancy. One idea for achieving this effect, if you have a bunch of partial results that are summed at the end, would be to transform the final summation of many variables holding partial results into accumulating into a single variable as each partial result is generated.
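
As an illustration of the first point applied to the question's shader, the fetch/arithmetic interleaving could take this shape (a sketch only - the compiler may already be doing this):

// Fetch just the four texels one luminance group needs, fold them into
// their vec4 immediately, then move on; the arithmetic can then execute
// in the shadow of the following loads.
vec3 P7  = texture2D(rubyTexture, xyp_6_7_8.yw).rgb;
vec3 P11 = texture2D(rubyTexture, xyp_11_12_13.xw).rgb;
vec3 P13 = texture2D(rubyTexture, xyp_11_12_13.zw).rgb;
vec3 P17 = texture2D(rubyTexture, xyp_16_17_18.yw).rgb;
vec4 p7  = lum_to(P7, P11, P17, P13);
vec4 p11 = p7.yzwx;
vec4 p13 = p7.wxyz;
vec4 p17 = p7.zwxy;

vec3 P6 = texture2D(rubyTexture, xyp_6_7_8.xw).rgb;
vec3 P8 = texture2D(rubyTexture, xyp_6_7_8.zw).rgb;
// ... and so on, one group at a time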

As always with performance, YMMV

ahcox
  • Regarding moving arithmetic statements - most of the arithmetic there is dependent on the fetched texels (except a few). Would you suggest spreading it evenly? (like: read texels 7, 11, 17, 13, do computations, read texels 8, 6, 16, 18, do computations, and so on) – SirKnigget May 20 '13 at 23:54