3

Someone told me the factor of speed differences is about 40x. Curious, I wrote a benchmark. It required some help from greybeards who were more knowledgeable about what might be optimizing out, but after several revisions we just cannot find a meaningful difference between divsd and mulsd.

Here's results:

➜  linked gcc main.c && ./a.out 
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 2080434
mul: 1925889
div / mul: 1.080246
0.000000

And with O3:

➜  linked gcc main.c -O3 && ./a.out
0.0000000000000000 4.6593234399298842
0.0000000000000000 4.6593234399298842
div: 1948388
mul: 1804587
div / mul: 1.079686
0.000000

The code:

#include <time.h>
#include <stdio.h>

#define TRIALS 10000000

int main() {
    double op = 0.0;
    double x = 1.0;
    double const y = 1.21462343468798723984729;
    time_t div_start = clock();
    for (size_t i = 0; i < TRIALS; i++) {
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
        x /= y;
        op += x;
    }
    time_t div_end = clock();
    printf("%.16f %.16f\n", x, op);
    time_t div_seconds = div_end - div_start;
    double op2 = 0.0;
    x = 1.0;
    double const z = 1 / y;
    time_t mul_start = clock();
    for (size_t i = 0; i < TRIALS; i++) {
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
        x *= z;
        op2 += x;
    }
    time_t mul_end = clock();
    time_t mul_seconds = mul_end - mul_start;
    printf("%.16f %.16f\n", x, op2);

    // print results as seconds
    printf("div: %ld\nmul: %ld\n", div_seconds, mul_seconds);
    printf("div / mul: %f\n", (double)div_seconds / (double)mul_seconds);
    printf("%f\n", op - op2);

    return 0;
}

The assembly with O3:

    .file   "main.c"
    .text
    .section    .rodata.str1.1,"aMS",@progbits,1
.LC3:
    .string "%.16f %.16f\n"
.LC5:
    .string "div: %ld\nmul: %ld\n"
.LC6:
    .string "div / mul: %f\n"
.LC7:
    .string "%f\n"
    .section    .text.startup,"ax",@progbits
    .p2align 4
    .globl  main
    .type   main, @function
main:
.LFB11:
    .cfi_startproc
    pushq   %r13
    .cfi_def_cfa_offset 16
    .cfi_offset 13, -16
    pushq   %r12
    .cfi_def_cfa_offset 24
    .cfi_offset 12, -24
    pushq   %rbp
    .cfi_def_cfa_offset 32
    .cfi_offset 6, -32
    pushq   %rbx
    .cfi_def_cfa_offset 40
    .cfi_offset 3, -40
    subq    $40, %rsp
    .cfi_def_cfa_offset 80
    call    clock@PLT
    movq    .LC0(%rip), %rcx
    pxor    %xmm2, %xmm2
    movsd   .LC2(%rip), %xmm1
    movq    %rax, %rbp
    movl    $10000000, %eax
    movq    %rcx, %xmm0
    .p2align 4,,10
    .p2align 3
.L2:
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    divsd   %xmm1, %xmm0
    addsd   %xmm0, %xmm2
    subq    $1, %rax
    jne .L2
    movsd   %xmm2, 8(%rsp)
    leaq    .LC3(%rip), %r13
    movsd   %xmm0, 16(%rsp)
    call    clock@PLT
    movsd   8(%rsp), %xmm2
    movsd   16(%rsp), %xmm0
    movq    %r13, %rdi
    movq    %rax, %rbx
    movl    $2, %eax
    movapd  %xmm2, %xmm1
    subq    %rbp, %rbx
    call    printf@PLT
    call    clock@PLT
    movq    .LC0(%rip), %rdx
    movsd   8(%rsp), %xmm2
    pxor    %xmm1, %xmm1
    movsd   .LC4(%rip), %xmm3
    movq    %rax, %r12
    movl    $10000000, %eax
    movq    %rdx, %xmm0
    .p2align 4,,10
    .p2align 3
.L3:
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    mulsd   %xmm3, %xmm0
    addsd   %xmm0, %xmm1
    subq    $1, %rax
    jne .L3
    movsd   %xmm2, 24(%rsp)
    movsd   %xmm1, 8(%rsp)
    movsd   %xmm0, 16(%rsp)
    call    clock@PLT
    movsd   8(%rsp), %xmm1
    movsd   16(%rsp), %xmm0
    movq    %r13, %rdi
    subq    %r12, %rax
    movq    %rax, %rbp
    movl    $2, %eax
    call    printf@PLT
    movq    %rbp, %rdx
    movq    %rbx, %rsi
    xorl    %eax, %eax
    leaq    .LC5(%rip), %rdi
    call    printf@PLT
    pxor    %xmm0, %xmm0
    pxor    %xmm3, %xmm3
    leaq    .LC6(%rip), %rdi
    cvtsi2sdq   %rbp, %xmm3
    movl    $1, %eax
    cvtsi2sdq   %rbx, %xmm0
    divsd   %xmm3, %xmm0
    call    printf@PLT
    movsd   24(%rsp), %xmm2
    movsd   8(%rsp), %xmm1
    leaq    .LC7(%rip), %rdi
    movl    $1, %eax
    subsd   %xmm1, %xmm2
    movapd  %xmm2, %xmm0
    call    printf@PLT
    addq    $40, %rsp
    .cfi_def_cfa_offset 40
    xorl    %eax, %eax
    popq    %rbx
    .cfi_def_cfa_offset 32
    popq    %rbp
    .cfi_def_cfa_offset 24
    popq    %r12
    .cfi_def_cfa_offset 16
    popq    %r13
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE11:
    .size   main, .-main
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
.LC0:
    .long   0
    .long   1072693248
    .align 8
.LC2:
    .long   -74511709
    .long   1072918296
    .align 8
.LC4:
    .long   644960408
    .long   1072322682
    .ident  "GCC: (GNU) 12.2.0"
    .section    .note.GNU-stack,"",@progbits

It's clear divsd is not being optimized out? What am I doing wrong?

Peter Cordes
  • 328,167
  • 45
  • 605
  • 847
mas
  • 1,155
  • 1
  • 11
  • 28
  • 1
    What does the processor's programming manual say on both instructions `divsd` and `mulsd` about needed clocks? Do they depend on values, for example? – the busybee Apr 04 '22 at 14:48
  • 1
    I have a (lscpu out): Model name: Intel(R) Core(TM) i5-10400 CPU @ 2.90GHz How do I look that up? Not a greybeard here. – mas Apr 04 '22 at 14:51
  • `Someone told me the factor of speed differences is about 40x.` When & where? – जलजनक Apr 04 '22 at 15:01
  • At work, the internet seems to say about 13x. – mas Apr 04 '22 at 15:08
  • SO seems to say 24x: https://stackoverflow.com/a/4125074/9691276 – mas Apr 04 '22 at 15:08
  • 2
    In the div case, the value of x is 0 for most of the test. It's likely that dividing 0 by something is faster than a general div operation (just as it is for humans). – prl Apr 04 '22 at 15:59
  • 1
    BTW [here](http://users.atw.hu/instlatx64/GenuineIntel/GenuineIntel00906EA_Coffeelake_InstLatX64.txt) are the instruction timings for the kind of processor you're using, so you can base predictions off of that rather than arbitrary numbers. However, based on those numbers, we still have a mystery - `mulsd` and `divsd` are not *that* close on your processor. – harold Apr 04 '22 at 16:06
  • 4
    @harold it seems @prl is correct, when I change y to 0.9999999999999 or some expansion thereof I see a bit more of a difference: ``` div: 173299 mul: 49747 div / mul: 3.483607 ``` – mas Apr 04 '22 at 16:11
  • Wild guess here. There is a read after write dependency (write xmm0 then read xmm0) in the code that could prevent the MUL instruction to go as fast as it could. Even taking into account bypassing this could still be inserting bubbles in the pipeline and slow down the MUL more than the DIV. – Fra93 Apr 04 '22 at 16:18
  • 1
    I think you found a *very strange effect* of Intel (~Skylake/Icelake) processors! With values like in the range [1.0;2.0], both the div and mul operations are much much slower than what they should resulting in similar timings. Outside this range, the div is significantly slower (as expected though it is still a bit fast). It looks like a hardware bug... – Jérôme Richard Apr 04 '22 at 16:27
  • Greybeard coworker: from what I could glean from an internet search, modern processors use a digit recurrence algorithm for division. It appears to have conditionals for convergence, so the values being divided matter. – mas Apr 04 '22 at 16:39
  • 2
    Multiplication is known to have some bad-performance edge case when a denormal float is multiplied by a normal one (and some other cases involving denormals) so it could be that (not a new thing, but still a weird thing). But, dividing zero by something is not a particularly fast case, at least not on your processor. The time division takes is somewhat value-dependent though. – harold Apr 04 '22 at 16:40
  • 2
    I checked on my Skylake i5-9600KF machine and I confirm the issue with y~1.214 is due to denormalized numbers! Using [this](https://stackoverflow.com/a/8217313/12939557) to truncate denormalized numbers results in the `div` code being 3-4 times slower than the mul which is expected (instead of ~1). For more information about denormalized numbers, please read https://stackoverflow.com/questions/54937154 and https://stackoverflow.com/questions/15140847 . That being said, I can also reproduce the strange speed up with y>2 and I still do not understand why this happens... – Jérôme Richard Apr 04 '22 at 18:54
  • 2
    re: relative performance of divsd vs. mulsd on modern Intel, for normalized inputs/outpus: [Floating point division vs floating point multiplication](https://stackoverflow.com/a/45899202). For throughput, if you don't do it often, mixing in one div can be as cheap as another mul. But with lots of divsd, a factor of 8 throughput, factor of ~4.5 latency (which you're measuring). But if you Also, re: lack of warm-up loop before first timed interval: [Idiomatic way of performance evaluation?](https://stackoverflow.com/q/60291987). – Peter Cordes Apr 04 '22 at 22:59
  • Older CPUs had worse ratios of div vs. mul throughput, and also worse latency. – Peter Cordes Apr 04 '22 at 23:02
  • 2
    My previous comment might have been confusing: `divsd` latency is 13-14 cycles, regardless of whether you're doing it back-to-back like here, or mixed in occasionally with other work. The divider unit being not fully pipelined matters only for throughput, being relevant when there are multiple dep chains, e.g. x1 /= y; x2 /= y; x3 /= y, etc. If you aren't trying to start another divide in the next cycle, it doesn't matter; a multiply can start the next cycle on the FMA unit that's also on port 0, and `divsd` is a single-uop instruction. – Peter Cordes Apr 05 '22 at 03:56
  • 1
    According to [uops.info](https://uops.info/html-instr/DIVSD_XMM_XMM.html) the latency of `divsd` is always `≤` some value. As @prl I assume for trivial quotients such as `0 / something` the CPU probably has a shortcut. I recommend resetting `x` every ~1000 iterations (for example) – chtz Apr 05 '22 at 11:11

0 Answers0