I am trying to speed up the calculation of the average of 4D vectors stored in an array. Here is my code:
#include <sys/time.h>
#include <sys/param.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>
typedef float dot[4];
#define N 1000000
/* Return the current wall-clock time as seconds (with microsecond
   resolution) in a double, for simple interval timing. */
double gettime ()
{
    struct timeval now;

    gettimeofday (&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec * 0.000001;
}
/* Naive scalar version: average n 4-D vectors into res, one
   component at a time. res must not alias array. */
void calc_avg1 (dot res, const dot array[], int n)
{
    int vec, comp;

    /* Zero the accumulator (equivalent to the all-bits-zero memset:
       0.0f is represented as zero bits on this target). */
    for (comp = 0; comp < 4; comp++)
        res[comp] = 0.0f;

    /* Accumulate each component separately. */
    for (vec = 0; vec < n; vec++)
        for (comp = 0; comp < 4; comp++)
            res[comp] += array[vec][comp];

    /* NOTE(review): n == 0 divides by zero here (0/0 -> NaN), exactly
       as in the original. */
    for (comp = 0; comp < 4; comp++)
        res[comp] /= n;
}
/* SSE version: average n 4-D vectors into res using one packed
   accumulator (all four components summed in parallel).
   Fix: the original used GNU C vector-operator extensions (+= and /=
   on __v4sf), which only compile on GCC/Clang; the __m128 type with
   _mm_add_ps/_mm_div_ps is the portable intrinsics API and generates
   the same addps/divps instructions. */
void calc_avg2 (dot res, const dot array[], int n)
{
    __m128 sum = _mm_setzero_ps ();
    int i;

    /* _mm_load_ps requires 16-byte alignment; sizeof(dot) == 16 and
       main() aligns the array base, so every element qualifies. */
    for (i = 0; i < n; i++)
        sum = _mm_add_ps (sum, _mm_load_ps (array[i]));

    /* NOTE(review): n == 0 divides by zero (NaNs), as the original did. */
    sum = _mm_div_ps (sum, _mm_set1_ps ((float) n));
    _mm_store_ps (res, sum);
}
int main ()
{
    /* Over-allocate by 15 bytes so the base address can be rounded up
       to a 16-byte boundary, as required by _mm_load_ps in calc_avg2. */
    void *space = malloc (N * sizeof (dot) + 15);
    dot *array;
    dot avg __attribute__((aligned(16)));
    int i;
    double time;

    /* Fix: the original never checked the allocation. */
    if (space == NULL)
    {
        fprintf (stderr, "malloc failed\n");
        return 1;
    }
    array = (dot*)(((unsigned long)space + 15) & ~(unsigned long)15);

    for (i = 0; i < N; i++)
    {
        array[i][0] = 1.0 * random ();
        array[i][1] = 1.0 * random ();
        array[i][2] = 1.0 * random ();
        /* Fix: component 3 was left uninitialized, yet both calc_avg
           functions read all four components — give it a defined value. */
        array[i][3] = 0.0;
    }

    time = gettime ();
    calc_avg1 (avg, array, N);
    time = gettime () - time;
    printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);

    time = gettime ();
    calc_avg2 (avg, array, N);
    time = gettime () - time;
    printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);

    /* Fix: the original leaked the buffer. */
    free (space);
    return 0;
}
So, as you can see, calc_avg1
uses naive scalar loops over the four components, while calc_avg2
replaces them with SSE instructions. I compile this code with clang 3.4:
cc -O2 -o test test.c
Here is disassemble of calc_avgX functions:
0000000000400860 <calc_avg1>:
400860: 55 push %rbp
400861: 48 89 e5 mov %rsp,%rbp
400864: 85 d2 test %edx,%edx
400866: 0f 57 c0 xorps %xmm0,%xmm0
400869: 0f 11 07 movups %xmm0,(%rdi)
40086c: 7e 42 jle 4008b0 <calc_avg1+0x50>
40086e: 48 83 c6 0c add $0xc,%rsi
400872: 0f 57 c0 xorps %xmm0,%xmm0
400875: 89 d0 mov %edx,%eax
400877: 0f 57 c9 xorps %xmm1,%xmm1
40087a: 0f 57 d2 xorps %xmm2,%xmm2
40087d: 0f 57 db xorps %xmm3,%xmm3
400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3
400885: f3 0f 11 1f movss %xmm3,(%rdi)
400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2
40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1
400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
40089d: f3 0f 58 06 addss (%rsi),%xmm0
4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008a6: 48 83 c6 10 add $0x10,%rsi
4008aa: ff c8 dec %eax
4008ac: 75 d2 jne 400880 <calc_avg1+0x20>
4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c>
4008b0: 0f 57 c0 xorps %xmm0,%xmm0
4008b3: 0f 57 c9 xorps %xmm1,%xmm1
4008b6: 0f 57 d2 xorps %xmm2,%xmm2
4008b9: 0f 57 db xorps %xmm3,%xmm3
4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4
4008c0: f3 0f 5e dc divss %xmm4,%xmm3
4008c4: f3 0f 11 1f movss %xmm3,(%rdi)
4008c8: f3 0f 5e d4 divss %xmm4,%xmm2
4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
4008d1: f3 0f 5e cc divss %xmm4,%xmm1
4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
4008da: f3 0f 5e c4 divss %xmm4,%xmm0
4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008e3: 5d pop %rbp
4008e4: c3 retq
4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1)
4008ec: 00 00 00 00
00000000004008f0 <calc_avg2>:
4008f0: 55 push %rbp
4008f1: 48 89 e5 mov %rsp,%rbp
4008f4: 85 d2 test %edx,%edx
4008f6: 0f 57 c0 xorps %xmm0,%xmm0
4008f9: 7e 10 jle 40090b <calc_avg2+0x1b>
4008fb: 89 d0 mov %edx,%eax
4008fd: 0f 1f 00 nopl (%rax)
400900: 0f 58 06 addps (%rsi),%xmm0
400903: 48 83 c6 10 add $0x10,%rsi
400907: ff c8 dec %eax
400909: 75 f5 jne 400900 <calc_avg2+0x10>
40090b: 66 0f 6e ca movd %edx,%xmm1
40090f: 66 0f 70 c9 00 pshufd $0x0,%xmm1,%xmm1
400914: 0f 5b c9 cvtdq2ps %xmm1,%xmm1
400917: 0f 5e c1 divps %xmm1,%xmm0
40091a: 0f 29 07 movaps %xmm0,(%rdi)
40091d: 5d pop %rbp
40091e: c3 retq
40091f: 90 nop
And here is the result:
> ./test
0.004287
1073864320.000000 1074018048.000000 1073044224.000000
0.003661
1073864320.000000 1074018048.000000 1073044224.000000
So SSE version is 1.17 times faster. But when I try to do seemingly the same job, which is to calculate average of single precision scalars in an array (as described here SSE reduction of float vector for example), SSE version runs 3.32 times faster. Here is the code of calc_avgX functions:
/* Scalar baseline: sum all n floats in a single float accumulator
   and divide by n. NOTE(review): n == 0 yields 0/0 (NaN). */
float calc_avg1 (const float array[], int n)
{
    float total = 0.0f;
    int idx = 0;

    while (idx < n)
    {
        total += array[idx];
        idx++;
    }
    return total / n;
}
/* SSE version of the scalar average: sum four floats per iteration,
   then reduce the vector horizontally and divide by n.
   Fix: the original called _mm_hadd_ps, an SSE3 intrinsic declared in
   <pmmintrin.h> — but this file only includes <xmmintrin.h>, and
   baseline x86-64 guarantees only SSE2, so that version may not even
   compile without -msse3. It also subscripted a __v4sf (r[0]), a GNU
   extension. This version uses only SSE1 intrinsics. */
float calc_avg3 (const float array[], int n)
{
    __m128 acc = _mm_setzero_ps ();
    int i;

    /* Assumes n is a multiple of 4 and array is 16-byte aligned
       (required by _mm_load_ps) — same preconditions as the original. */
    for (i = 0; i < n; i += 4)
        acc = _mm_add_ps (acc, _mm_load_ps (&array[i]));

    /* SSE1-only horizontal sum of acc = {a, b, c, d}. */
    __m128 shuf = _mm_shuffle_ps (acc, acc, _MM_SHUFFLE (2, 3, 0, 1));
    __m128 sums = _mm_add_ps (acc, shuf);   /* {a+b, a+b, c+d, c+d} */
    shuf = _mm_movehl_ps (shuf, sums);      /* lane 0 = c+d         */
    sums = _mm_add_ss (sums, shuf);         /* lane 0 = a+b+c+d     */

    return _mm_cvtss_f32 (sums) / n;
}
So my question is this: why do I benefit from SSE so much in the last example (calculating the average of single-precision scalars) but not in the first (calculating the average of 4D vectors)? To me, these jobs look almost the same. What is the right way to speed up the calculations in the first example, if I am doing it wrong?
UPD: If you think it's relevant, I also supply disassembly of the second example, where average of scalars is calculated (also compiled with clang3.4 -O2).
0000000000400860 <calc_avg1>:
400860: 55 push %rbp
400861: 48 89 e5 mov %rsp,%rbp
400864: 85 d2 test %edx,%edx
400866: 0f 57 c0 xorps %xmm0,%xmm0
400869: 0f 11 07 movups %xmm0,(%rdi)
40086c: 7e 42 jle 4008b0 <calc_avg1+0x50>
40086e: 48 83 c6 0c add $0xc,%rsi
400872: 0f 57 c0 xorps %xmm0,%xmm0
400875: 89 d0 mov %edx,%eax
400877: 0f 57 c9 xorps %xmm1,%xmm1
40087a: 0f 57 d2 xorps %xmm2,%xmm2
40087d: 0f 57 db xorps %xmm3,%xmm3
400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3
400885: f3 0f 11 1f movss %xmm3,(%rdi)
400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2
40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1
400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
40089d: f3 0f 58 06 addss (%rsi),%xmm0
4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008a6: 48 83 c6 10 add $0x10,%rsi
4008aa: ff c8 dec %eax
4008ac: 75 d2 jne 400880 <calc_avg1+0x20>
4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c>
4008b0: 0f 57 c0 xorps %xmm0,%xmm0
4008b3: 0f 57 c9 xorps %xmm1,%xmm1
4008b6: 0f 57 d2 xorps %xmm2,%xmm2
4008b9: 0f 57 db xorps %xmm3,%xmm3
4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4
4008c0: f3 0f 5e dc divss %xmm4,%xmm3
4008c4: f3 0f 11 1f movss %xmm3,(%rdi)
4008c8: f3 0f 5e d4 divss %xmm4,%xmm2
4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
4008d1: f3 0f 5e cc divss %xmm4,%xmm1
4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
4008da: f3 0f 5e c4 divss %xmm4,%xmm0
4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008e3: 5d pop %rbp
4008e4: c3 retq
4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1)
4008ec: 00 00 00 00
00000000004008d0 <calc_avg3>:
4008d0: 55 push %rbp
4008d1: 48 89 e5 mov %rsp,%rbp
4008d4: 31 c0 xor %eax,%eax
4008d6: 85 f6 test %esi,%esi
4008d8: 0f 57 c0 xorps %xmm0,%xmm0
4008db: 7e 0f jle 4008ec <calc_avg3+0x1c>
4008dd: 0f 1f 00 nopl (%rax)
4008e0: 0f 58 04 87 addps (%rdi,%rax,4),%xmm0
4008e4: 48 83 c0 04 add $0x4,%rax
4008e8: 39 f0 cmp %esi,%eax
4008ea: 7c f4 jl 4008e0 <calc_avg3+0x10>
4008ec: 66 0f 70 c8 01 pshufd $0x1,%xmm0,%xmm1
4008f1: f3 0f 58 c8 addss %xmm0,%xmm1
4008f5: 66 0f 70 d0 03 pshufd $0x3,%xmm0,%xmm2
4008fa: 0f 12 c0 movhlps %xmm0,%xmm0
4008fd: f3 0f 58 c1 addss %xmm1,%xmm0
400901: f3 0f 58 c2 addss %xmm2,%xmm0
400905: 0f 57 c9 xorps %xmm1,%xmm1
400908: f3 0f 2a ce cvtsi2ss %esi,%xmm1
40090c: f3 0f 5e c1 divss %xmm1,%xmm0
400910: 5d pop %rbp
400911: c3 retq
400912: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1)
400919: 1f 84 00 00 00 00 00