
(For testing purposes) I have written a simple method to calculate the transpose of an n×n matrix:

#include <cstddef>      // size_t
#include <sys/types.h>  // uint (the glibc typedef for unsigned int)

void transpose(const size_t _n, double* _A) {
    // swap the upper and lower triangle, element by element
    for(uint i=0; i < _n; ++i) {
        for(uint j=i+1; j < _n; ++j) {
            double tmp  = _A[i*_n+j];
            _A[i*_n+j] = _A[j*_n+i];
            _A[j*_n+i] = tmp;
        }
    }
}
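
I time it with a harness along these lines (simplified; not the exact code used for the measurements below, but the same idea):

#include <chrono>
#include <cstdio>
#include <vector>

// calls the transpose() above for a range of sizes and prints microseconds
int main() {
    for (size_t n = 1020; n <= 1030; ++n) {
        std::vector<double> A(n * n, 1.0);
        auto t0 = std::chrono::high_resolution_clock::now();
        transpose(n, A.data());
        auto t1 = std::chrono::high_resolution_clock::now();
        long long us = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
        std::printf("%zu\t%lld\n", n, us);
    }
}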

When using optimization levels O3 or Ofast I expected the compiler to unroll some loops, which should lead to higher performance, especially when the matrix size is a multiple of 2 (so that, e.g., a loop body unrolled by two can run every iteration without a remainder). Instead, what I measured was the exact opposite: powers of 2 show a significant spike in execution time.

There are also spikes at regular intervals of 64, more pronounced ones at intervals of 128, and so on. Each spike extends to the neighboring matrix sizes, as in the following table:

size n  time(us)
1020    2649
1021    2815
1022    3100
1023    5428
1024    15791
1025    6778
1026    3106
1027    2847
1028    2660
1029    3038
1030    2613

I compiled with gcc 4.8.2, but the same thing happens with clang 3.5, so this might be some generic thing?

So my question basically is: Why is there this periodic increase in execution time? Is it some generic effect of the optimization options (it happens with clang and gcc alike)? If so, which optimization option is causing it?

And how can this be so significant that even the O0 version of the program outperforms the O3 version at multiples of 512?

[Plot: execution time vs. matrix size for O0 and O3]


EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024×1024 matrix with optimization actually takes as much time as transposing a 1300×1300 matrix without optimization. If this is a cache-miss / page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program that it fails for powers of two, only to recover high performance for slightly larger matrices. Shouldn't cache misses create more of a step-like pattern? Why do the execution times go down again at all? (And why should optimization create cache misses that weren't there before?)


EDIT: the following are the assembler listings that gcc produced

no optimization (O0):

_Z9transposemRPd:
.LFB0:
    .cfi_startproc
    push    rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    mov rbp, rsp
    .cfi_def_cfa_register 6
    mov QWORD PTR [rbp-24], rdi
    mov QWORD PTR [rbp-32], rsi
    mov DWORD PTR [rbp-4], 0
    jmp .L2
.L5:
    mov eax, DWORD PTR [rbp-4]
    add eax, 1
    mov DWORD PTR [rbp-8], eax
    jmp .L3
.L4:
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-4]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-8]
    add rax, rcx
    sal rax, 3
    add rax, rdx
    mov rax, QWORD PTR [rax]
    mov QWORD PTR [rbp-16], rax
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-4]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-8]
    add rax, rcx
    sal rax, 3
    add rdx, rax
    mov rax, QWORD PTR [rbp-32]
    mov rcx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-8]
    imul    rax, QWORD PTR [rbp-24]
    mov rsi, rax
    mov eax, DWORD PTR [rbp-4]
    add rax, rsi
    sal rax, 3
    add rax, rcx
    mov rax, QWORD PTR [rax]
    mov QWORD PTR [rdx], rax
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-8]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-4]
    add rax, rcx
    sal rax, 3
    add rdx, rax
    mov rax, QWORD PTR [rbp-16]
    mov QWORD PTR [rdx], rax
    add DWORD PTR [rbp-8], 1
.L3:
    mov eax, DWORD PTR [rbp-8]
    cmp rax, QWORD PTR [rbp-24]
    jb  .L4
    add DWORD PTR [rbp-4], 1
.L2:
    mov eax, DWORD PTR [rbp-4]
    cmp rax, QWORD PTR [rbp-24]
    jb  .L5
    pop rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   _Z9transposemRPd, .-_Z9transposemRPd
    .ident  "GCC: (Debian 4.8.2-15) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

with optimization (O3):

_Z9transposemRPd:
.LFB0:
    .cfi_startproc
    push    rbx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    xor r11d, r11d
    xor ebx, ebx
.L2:
    cmp r11, rdi
    mov r9, r11
    jae .L10
    .p2align 4,,10
    .p2align 3
.L7:
    add ebx, 1
    mov r11d, ebx
    cmp rdi, r11
    mov rax, r11
    jbe .L2
    mov r10, r9
    mov r8, QWORD PTR [rsi]
    mov edx, ebx
    imul    r10, rdi
    .p2align 4,,10
    .p2align 3
.L6:
    lea rcx, [rax+r10]
    add edx, 1
    imul    rax, rdi
    lea rcx, [r8+rcx*8]
    movsd   xmm0, QWORD PTR [rcx]
    add rax, r9
    lea rax, [r8+rax*8]
    movsd   xmm1, QWORD PTR [rax]
    movsd   QWORD PTR [rcx], xmm1
    movsd   QWORD PTR [rax], xmm0
    mov eax, edx
    cmp rdi, rax
    ja  .L6
    cmp r11, rdi
    mov r9, r11
    jb  .L7
.L10:
    pop rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE0:
    .size   _Z9transposemRPd, .-_Z9transposemRPd
    .ident  "GCC: (Debian 4.8.2-15) 4.8.2"
    .section    .note.GNU-stack,"",@progbits
example
  • The size of that function is nominal, I am more than a little curious what the produced asm code looks like in each case. If you can provide the concrete asm from both builds as tested to produce your results, it would make a wonderful addition to your question. – WhozCraig Feb 18 '14 at 01:19
  • @WhozCraig added the assembler codes. They were produced from a new separate file, but with the identical compiler calls, so they should be identical to the code I used for the plot. – example Feb 18 '14 at 01:31
  • What happens if you qualify the loop with [`#pragma GCC ivdep`](http://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html#Loop-Specific-Pragmas)? Not so much for SIMD optimizations, but to tell the compiler that each element transpose is independent. – Brett Hale Feb 18 '14 at 01:39
  • @BrettHale That is a nice pragma (that I did not know so far). But unfortunately it is not available for gcc 4.8 (at least mine does not know it ^^) and discussions about it are new enough that this might be a feature of 4.9. – example Feb 18 '14 at 01:52
  • You've tagged this as 'c++', so why not replace the inner part of the loop with 'std::swap'? The power-of-2 is probably either a cache or page fault. For a more optimal approach to matrix transposition, see http://stackoverflow.com/a/16743203/257645 – kfsone Feb 18 '14 at 01:53
  • @kfsone the inner loop starts at `i+1` so the diagonal elements (i=j) are never touched. I could use std::swap but that is actually implemented just as I have written it in the code (or very similar). Anyways, my goal is not to implement the best matrix transpose, but rather to understand this strange optimization behaviour. – example Feb 18 '14 at 02:05
  • The power-of-two super-alignment slowdowns are pretty well known: http://stackoverflow.com/q/12264970/922184, http://stackoverflow.com/q/8547778/922184 How it applies here I'm not sure. Perhaps the one without optimizations is being perturbed in a way that pushes it away from super-alignment. – Mysticial Feb 18 '14 at 04:50

4 Answers


The periodic increase in execution time must be due to the cache being only N-way associative instead of fully associative. You are witnessing hash collisions in the cache line selection algorithm.

The fastest L1 cache has a smaller number of cache lines than the next level, L2. In each level, each cache line can be filled only from a limited set of sources.

Typical HW implementations of the cache line selection algorithm just use a few bits from the memory address to determine in which cache slot the data should be written -- in HW, bit shifts are free.
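
For illustration (the numbers are an assumption, not any specific CPU): with a 32 KiB, 8-way cache and 64-byte lines there are 32768 / (64 * 8) = 64 sets, and the set is picked by just six address bits:

#include <cstddef>
#include <cstdint>

// set selection from plain address bits: drop the 6 line-offset bits,
// keep the next 6 bits as the set index
std::size_t set_index(std::uintptr_t addr) {
    return (addr / 64) % 64;
}
// any two addresses whose difference is a multiple of 64 sets * 64 bytes
// = 4096 bytes get the same set index and have to share that set's 8 ways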

This causes competition between memory ranges, e.g. between addresses 0x300010 and 0x341010. In a fully sequential algorithm this doesn't matter -- N is large enough for practically all algorithms of the form:

 for (i=0;i<1000;i++) a[i] += b[i] * c[i] + d[i];

But when the number of inputs (or outputs) gets larger, which happens internally when the algorithm is optimized, having one input in the cache forces another input out of the cache.

 // one possible method of optimization with 2 outputs and 6 inputs
 // with two unrelated execution paths -- should be faster, but maybe it isn't
 for (i=0;i<500;i++) { 
       a[i]     += b[i]     * c[i]     + d[i];
       a[i+500] += b[i+500] * c[i+500] + d[i+500];
 }

A graph in Example 5: Cache Associativity illustrates a 512-byte offset between matrix rows being a global worst-case dimension for that particular system. When this is known, a working mitigation is to over-allocate the matrix horizontally to some other dimension, e.g. char matrix[512][512 + 64].
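
A minimal sketch of that mitigation for the transpose from the question (the 8 doubles of padding per row are an arbitrary choice; anything that breaks the power-of-two row stride will do):

#include <cstddef>
#include <vector>

// Transpose the logical n x n matrix stored with a padded row length ld >= n,
// so that consecutive rows are no longer a power-of-two number of bytes apart.
void transpose_padded(std::size_t n, std::size_t ld, double* a) {
    for (std::size_t i = 0; i < n; ++i) {
        for (std::size_t j = i + 1; j < n; ++j) {
            double tmp    = a[i * ld + j];
            a[i * ld + j] = a[j * ld + i];
            a[j * ld + i] = tmp;
        }
    }
}

// usage sketch: pad each row by 8 doubles (one 64-byte cache line)
// std::size_t n = 1024, ld = n + 8;
// std::vector<double> a(n * ld);
// transpose_padded(n, ld, a.data());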

Aki Suihkonen
  • What one can do to combat this behavior is to allocate more than a necessary amount of memory to the array -- specifically `A[n][n+w]` to force subsequent rows not to compete with each other. OR you can split the transformation to blocks of (typically) 8x8 submatrices. – Aki Suihkonen Feb 18 '14 at 07:41
  • The number of sources used does not change with increasing matrix size, so I assume that only the simple hash function that is used causes collisions and thus removes cached data prematurely. Are these hash functions publicly known, so that I may work around them? Are they different for different CPU (vendors)? Maybe I should check that same code on another architecture... – example Feb 18 '14 at 11:49

The improvement in performance is likely related to CPU/RAM caching.

When the data size is not a power of 2, a cache line load (of, say, 16, 32, or 64 words) transfers more data than is required, uselessly tying up the bus. For a data set whose size is a power of 2, all of the pre-fetched data is used.

I bet if you were to disable L1 and L2 caching, the performance would be completely smooth and predictable. But it would be much slower. Caching really helps performance!

wallyk
  • "For a data set which is a power of 2, all of the pre-fetched data is used." - another reason why I would assume that powers of two are faster. But instead they are _much_ slower! Also wouldn't cache misses create more of a step-like pattern? Why does performance increase again after these evil-power-of-two's? – example Feb 18 '14 at 02:01

Comment with code: In the -O3 case, with

#include <cstdlib>

extern void transpose(const size_t n, double* a)
{
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = i + 1; j < n; ++j) {
            std::swap(a[i * n + j], a[j * n + i]); // or your expanded version.
        }
    }
}

compiling with

$ g++ --version
g++ (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
...
$ g++ -g1 -std=c++11 -Wall -o test.S -S test.cpp -O3

I get

_Z9transposemPd:
.LFB68:
    .cfi_startproc
.LBB2:
    testq   %rdi, %rdi
    je  .L1
    leaq    8(,%rdi,8), %r10
    xorl    %r8d, %r8d
.LBB3:
    addq    $1, %r8
    leaq    -8(%r10), %rcx
    cmpq    %rdi, %r8
    leaq    (%rsi,%rcx), %r9
    je  .L1
    .p2align 4,,10
    .p2align 3
.L10:
    movq    %r9, %rdx
    movq    %r8, %rax
    .p2align 4,,10
    .p2align 3
.L5:
.LBB4:
    movsd   (%rdx), %xmm1
    movsd   (%rsi,%rax,8), %xmm0
    movsd   %xmm1, (%rsi,%rax,8)
.LBE4:
    addq    $1, %rax
.LBB5:
    movsd   %xmm0, (%rdx)
    addq    %rcx, %rdx
.LBE5:
    cmpq    %rdi, %rax
    jne .L5
    addq    $1, %r8
    addq    %r10, %r9
    addq    %rcx, %rsi
    cmpq    %rdi, %r8
    jne .L10
.L1:
    rep ret
.LBE3:
.LBE2:
    .cfi_endproc

And something quite different if I add -m32.

(Note: it makes no difference to the assembly whether I use std::swap or your variant)

In order to understand what is causing the spikes, though, you probably want to visualize the memory operations going on.
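
Short of using hardware performance counters, one crude way to do that is to replay the access pattern through a toy cache model and count the misses. A sketch (the 32 KiB / 8-way / 64-byte-line / LRU parameters are assumptions for illustration, not a model of any particular CPU):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy set-associative cache with LRU replacement; only counts misses.
struct ToyCache {
    static constexpr std::size_t kLineBytes = 64;
    static constexpr std::size_t kWays      = 8;
    static constexpr std::size_t kSets      = 64;   // 32 KiB / (64 B * 8 ways)
    std::vector<std::vector<std::uint64_t>> sets;
    std::size_t misses;

    ToyCache() : sets(kSets), misses(0) {}

    void access(std::uint64_t addr) {
        std::uint64_t line = addr / kLineBytes;
        std::vector<std::uint64_t>& set = sets[line % kSets];
        for (std::size_t k = 0; k < set.size(); ++k) {
            if (set[k] == line) {                    // hit: move to MRU position
                set.erase(set.begin() + k);
                set.push_back(line);
                return;
            }
        }
        ++misses;                                    // miss: evict LRU if the set is full
        if (set.size() == kWays) set.erase(set.begin());
        set.push_back(line);
    }
};

int main() {
    const std::size_t sizes[] = {1020, 1024, 1030};
    for (std::size_t n : sizes) {
        ToyCache cache;
        for (std::size_t i = 0; i < n; ++i)
            for (std::size_t j = i + 1; j < n; ++j) {
                cache.access(8 * (i * n + j));       // _A[i*n+j]
                cache.access(8 * (j * n + i));       // _A[j*n+i]
            }
        std::cout << "n=" << n << "  simulated misses=" << cache.misses << '\n';
    }
}

With these toy parameters the row stride of a 1024-wide matrix is 8192 bytes, i.e. 128 lines, so every column-wise access lands in the same set; the simulated miss count should jump at n = 1024 in much the same way the measured times do.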

kfsone

To add to the other answers, compiling with g++ -std=c++11 -march=core2 -O3 -c -S (gcc version 4.8.2, MacPorts gcc48 4.8.2_0, x86_64-apple-darwin13.0.0):

__Z9transposemPd:
LFB0:
        testq   %rdi, %rdi
        je      L1
        leaq    8(,%rdi,8), %r10
        xorl    %r8d, %r8d
        leaq    -8(%r10), %rcx
        addq    $1, %r8
        leaq    (%rsi,%rcx), %r9
        cmpq    %rdi, %r8
        je      L1
        .align 4,0x90
L10:
        movq    %r9, %rdx
        movq    %r8, %rax
        .align 4,0x90
L5:
        movsd   (%rdx), %xmm0
        movsd   (%rsi,%rax,8), %xmm1
        movsd   %xmm0, (%rsi,%rax,8)
        addq    $1, %rax
        movsd   %xmm1, (%rdx)
        addq    %rcx, %rdx
        cmpq    %rdi, %rax
        jne     L5
        addq    $1, %r8
        addq    %r10, %r9
        addq    %rcx, %rsi
        cmpq    %rdi, %r8
        jne     L10
L1:
        rep; ret

Basically the same as @kfsone's code, for:

#include <cstddef>

void transpose(const size_t _n, double* _A) {
    for(size_t i=0; i < _n; ++i) {
        for(size_t j=i+1; j < _n; ++j) {
            double tmp  = _A[i*_n+j];
            _A[i*_n+j] = _A[j*_n+i];
            _A[j*_n+i] = tmp;
        }
    }
}

Apart from the Mach-O 'as' differences (extra underscore, align directives, and DWARF locations), it's the same. But it is very different from the OP's assembly output, with a much 'tighter' inner loop.

Brett Hale