Why sin/cos are slower when optimizations are enabled?

Question

After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I ran some tests with his code and found a weird thing: if I call sin/cos with a float value, it is much slower than with a double when compiled with optimization.

#include <cmath>
#include <cstdio>

// Side length of the two square lookup tables below.
const int N = 4000;

// Output tables, N x N single-precision values each; main() fills
// cosine[i][j] and sine[i][j] for every (i, j) pair.
float cosine[N][N];
float sine[N][N];

// Benchmark driver: fills the cosine/sine tables with the cosine and sine of
// i*j*2*pi/N for all 0 <= i, j < N. There is no output; only the run time
// is of interest.
int main() {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            // i*j*2 is computed in int, the multiply by M_PI and divide by N
            // in double; the result is then narrowed to float on assignment.
            float ang = i*j*2*M_PI/N;
            // Unqualified cos/sin (not std::cos/std::sin) called with a float
            // argument; the result is stored into a float table entry.
            cosine[i][j] = cos(ang);
            sine[i][j] = sin(ang);
        }
    }
}

With the above code I get:

With -O0: 2.402s

With -O1: 9.004s

With -O2: 9.013s

With -O3: 9.001s

Now if I change

float ang = i*j*2*M_PI/N;

To

double ang = i*j*2*M_PI/N;

I get:

With -O0: 2.362s

With -O1: 1.188s

With -O2: 1.197s

With -O3: 1.197s

How can the first test be that much faster without optimizations?

I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.

EDIT: Changed the title to better describe the problem.

EDIT: Added assembly code

Assembly for first test with O0:

    # g++ 4.5.2 -O0 output (GAS, AT&T syntax, x86-64) for the float-angle test.
    # Stack slots used by main:
    #   -4(%rbp)  = i   (outer loop counter)
    #   -8(%rbp)  = j   (inner loop counter)
    #   -12(%rbp) = ang (single precision)
    .file   "main.cpp"
.globl cosine
    .bss
    .align 32
    .type   cosine, @object
    .size   cosine, 64000000
    # 64000000 zero bytes = 4000 * 4000 four-byte floats.
cosine:
    .zero   64000000
.globl sine
    .align 32
    .type   sine, @object
    .size   sine, 64000000
    # Same size as cosine.
sine:
    .zero   64000000
    .text
.globl main
    .type   main, @function
main:
.LFB87:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    movq    %rsp, %rbp
    .cfi_offset 6, -16
    .cfi_def_cfa_register 6
    subq    $16, %rsp
    # i = 0, then jump to the outer loop test.
    movl    $0, -4(%rbp)
    jmp .L2
.L5:
    # Outer loop body: j = 0, then jump to the inner loop test.
    movl    $0, -8(%rbp)
    jmp .L3
.L4:
    # Inner loop body. eax = i * j * 2 (integer arithmetic).
    movl    -4(%rbp), %eax
    imull   -8(%rbp), %eax
    addl    %eax, %eax
    # Convert to double, multiply by .LC0 (pi), divide by .LC1 (4000.0).
    cvtsi2sd    %eax, %xmm0
    movsd   .LC0(%rip), %xmm1
    mulsd   %xmm1, %xmm0
    movsd   .LC1(%rip), %xmm1
    divsd   %xmm1, %xmm0
    # Narrow the double to float and spill it to the ang slot.
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0
    movss   %xmm0, -12(%rbp)
    # Reload ang, widen it back to double and call the double-precision cos.
    movss   -12(%rbp), %xmm0
    cvtps2pd    %xmm0, %xmm0
    call    cos
    # Narrow the double result to float.
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0
    # rax = i * 4000 + j; store the float at cosine + rax * 4 (cosine[i][j]).
    movl    -8(%rbp), %eax
    cltq
    movl    -4(%rbp), %edx
    movslq  %edx, %rdx
    imulq   $4000, %rdx, %rdx
    leaq    (%rdx,%rax), %rax
    movss   %xmm0, cosine(,%rax,4)
    # Same sequence for sin: widen ang, call the double-precision sin,
    # narrow the result and store it to sine[i][j].
    movss   -12(%rbp), %xmm0
    cvtps2pd    %xmm0, %xmm0
    call    sin
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0
    movl    -8(%rbp), %eax
    cltq
    movl    -4(%rbp), %edx
    movslq  %edx, %rdx
    imulq   $4000, %rdx, %rdx
    leaq    (%rdx,%rax), %rax
    movss   %xmm0, sine(,%rax,4)
    # j++
    addl    $1, -8(%rbp)
.L3:
    # Inner loop test: repeat while j <= 3999.
    cmpl    $3999, -8(%rbp)
    setle   %al
    testb   %al, %al
    jne .L4
    # i++
    addl    $1, -4(%rbp)
.L2:
    # Outer loop test: repeat while i <= 3999.
    cmpl    $3999, -4(%rbp)
    setle   %al
    testb   %al, %al
    jne .L5
    # Return 0.
    movl    $0, %eax
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE87:
    .size   main, .-main
    .section    .rodata
    .align 4
    .type   _ZL1N, @object
    .size   _ZL1N, 4
    # 4-byte constant 4000; presumably the mangled name of the file-local
    # const int N from the C++ source.
_ZL1N:
    .long   4000
    .align 8
    # Double 0x400921FB54442D18 = pi, stored low word first.
.LC0:
    .long   1413754136
    .long   1074340347
    .align 8
    # Double 0x40AF400000000000 = 4000.0, stored low word first.
.LC1:
    .long   0
    .long   1085227008
    .ident  "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
    .section    .note.GNU-stack,"",@progbits

Assembly for first test with O3:

    # g++ 4.5.2 -O3 output (GAS, AT&T syntax, x86-64) for the float-angle test.
    # Register roles in main:
    #   r15d = i (outer loop counter)
    #   r13d = 2 * i
    #   r12d = running multiple of 2 * i; equals 2 * i * (j + 1) on entry to .L3
    #   rbx  = address of the current cosine element
    #   rbp  = address of the current sine element
    #   r14  = one-past-the-end address of the current cosine row
    #   8(%rsp), 12(%rsp) = output slots passed by address to sincosf
    .file   "main.cpp"
    .text
    .p2align 4,,15
.globl main
    .type   main, @function
main:
.LFB121:
    .cfi_startproc
    pushq   %r15
    .cfi_def_cfa_offset 16
    xorl    %r15d, %r15d
    .cfi_offset 15, -16
    pushq   %r14
    .cfi_def_cfa_offset 24
    # End of row 0 of cosine: 16000 bytes = 4000 floats.
    movl    $cosine+16000, %r14d
    .cfi_offset 14, -24
    pushq   %r13
    .cfi_def_cfa_offset 32
    xorl    %r13d, %r13d
    .cfi_offset 13, -32
    pushq   %r12
    .cfi_def_cfa_offset 40
    pushq   %rbp
    .cfi_def_cfa_offset 48
    pushq   %rbx
    .cfi_def_cfa_offset 56
    subq    $24, %rsp
    .cfi_def_cfa_offset 80
    .p2align 4,,10
    .p2align 3
.L2:
    # Start of row i: rbx = cosine + i * 16000, rbp = sine + i * 16000.
    movslq  %r15d, %rbp
    .cfi_offset 3, -56
    .cfi_offset 6, -48
    .cfi_offset 12, -40
    movl    %r13d, %r12d
    # The j = 0 element is stored without a call: edx = 0x3f800000 (the bit
    # pattern of 1.0f) goes to cosine[i][0], eax = 0 (0.0f) to sine[i][0].
    movl    $0x3f800000, %edx
    imulq   $16000, %rbp, %rbp
    xorl    %eax, %eax
    leaq    cosine(%rbp), %rbx
    addq    $sine, %rbp
    jmp .L5
    .p2align 4,,10
    .p2align 3
.L3:
    # eax = r12d - r13d = 2 * i * j for the current column j.
    movl    %r12d, %eax
    # Output pointers for sincosf: rdi = 12(%rsp), rsi = 8(%rsp).
    leaq    8(%rsp), %rsi
    leaq    12(%rsp), %rdi
    subl    %r13d, %eax
    # Convert to double, multiply by .LC2 (pi), divide by .LC3 (4000.0),
    # then narrow to float: this is the value of ang.
    cvtsi2sd    %eax, %xmm0
    mulsd   .LC2(%rip), %xmm0
    divsd   .LC3(%rip), %xmm0
    unpcklpd    %xmm0, %xmm0
    cvtpd2ps    %xmm0, %xmm0
    # A single single-precision sincosf call replaces the separate
    # cos and sin calls of the source.
    call    sincosf
    # edx = value destined for cosine, eax = value destined for sine.
    movl    8(%rsp), %edx
    movl    12(%rsp), %eax
.L5:
    # Store the pair, advance both element pointers and the running multiple.
    movl    %edx, (%rbx)
    addq    $4, %rbx
    movl    %eax, 0(%rbp)
    addl    %r13d, %r12d
    addq    $4, %rbp
    # Inner loop test: continue until rbx reaches the end of the row.
    cmpq    %r14, %rbx
    jne .L3
    # Next row: i++, 2*i += 2; rbx now points at the next row's start,
    # so the new row end is rbx + 16000.
    addl    $1, %r15d
    addl    $2, %r13d
    leaq    16000(%rbx), %r14
    # Outer loop test: continue until i == 4000.
    cmpl    $4000, %r15d
    jne .L2
    addq    $24, %rsp
    .cfi_def_cfa_offset 56
    # Return 0.
    xorl    %eax, %eax
    popq    %rbx
    .cfi_def_cfa_offset 48
    popq    %rbp
    .cfi_def_cfa_offset 40
    popq    %r12
    .cfi_def_cfa_offset 32
    popq    %r13
    .cfi_def_cfa_offset 24
    popq    %r14
    .cfi_def_cfa_offset 16
    popq    %r15
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE121:
    .size   main, .-main
.globl cosine
    .bss
    .align 32
    .type   cosine, @object
    .size   cosine, 64000000
    # 64000000 zero bytes = 4000 * 4000 four-byte floats.
cosine:
    .zero   64000000
.globl sine
    .align 32
    .type   sine, @object
    .size   sine, 64000000
    # Same size as cosine.
sine:
    .zero   64000000
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
    # Double 0x400921FB54442D18 = pi, stored low word first.
.LC2:
    .long   1413754136
    .long   1074340347
    .align 8
    # Double 0x40AF400000000000 = 4000.0, stored low word first.
.LC3:
    .long   0
    .long   1085227008
    .ident  "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
    .section    .note.GNU-stack,"",@progbits

Your question would be easier to answer if you provided the generated assembly code for each case. Use the `-S` option to gcc to create an assembly listing. — Greg Hewgill, Aug 08 '11 at 04:25
@fbafelipe: Well something obvious is that the compiler is using `xmm1` with -O0, but not with -O3. Beats me as to why, though... — user541686, Aug 08 '11 at 04:41
I got surprising results today with your small benchmark code and experimenting with the functions ::sin, ::sinf, std::sin, sincos, sincosf. Depending on Processor (ARM64 vs AMD64) and compiler flags (-O0 vs -O3, -ffast-math or not) the runtime varied greatly (factor 2 to 5) between comparable versions. Short conclusion: Measure! — FelEnd, Dec 03 '19 at 16:07

score 5 · Accepted Answer · answered Aug 08 '11 at 04:36

5

Here's a possibility:

In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single.

You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.

Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.

This shouldn't account for a 9x performance difference, though.

answered Aug 08 '11 at 04:36

Cory Nelson

29,236
5
72
110

I changed the calls to cosf and sinf; with O0 it went to 17.198s and with O3 it was the same (8.999s). Checking the assembly I posted in the question, it shows a call to sincosf (instead of sincos - note that it used a function that computes sin and cos at the same time). So it seems that with optimization the compiler decided to change to a slower function instead of doing the cast... – fbafelipe Aug 08 '11 at 05:31
http://www.gnu.org/s/hello/manual/libc/FP-Function-Optimizations.html - does using __NO_MATH_INLINES remove this behaviour? – andrew cooke Aug 08 '11 at 12:24
@andrew cooke: No, same results with or without __NO_MATH_INLINES. – fbafelipe Aug 08 '11 at 16:29
The conversions between `float` and `double` do not account for it. I ran some tests today with g++ and found that when using `-O2` the `float` code was much slower. However, when I tested with manual conversions, like this: `(float)sin((double)input)` I found that the optimized `float` code ran _faster_ than the optimized `double` code, even though I was forcing the `float` code to use the `double` `sin` function. – Kyle A Jul 07 '17 at 01:33

user541686 · Answer 2 · 2011-08-08T04:44:19.213

2

AFAIK it's because computers work at double precision natively. Using float requires conversions.

edited Aug 08 '11 at 04:44

answered Aug 08 '11 at 04:01

user541686

205,094
128
528
886

But why is it faster with -O0 than with -O3? This performance problem only happens when optimization is enabled. – fbafelipe Aug 08 '11 at 04:04
@fbafelipe : Optimizing for size? That would make the most sense. Usually size comes at a cost. – Aug 08 '11 at 04:25
2

@fbafelipe: The only way to answer that question is to disassemble the compiled program with and without optimizations, as was done in the question you linked to. – Peter O. Aug 08 '11 at 04:26
1

@Ethan: But -Os is for size optimization; -O3 is supposed to be speed at the cost of size, with (for example) inlining. – Dave Aug 08 '11 at 05:17

Why sin/cos are slower when optimizations are enabled?

2 Answers