
I compiled this program with GCC 4.8 at -O2 on a Skylake i7-6700HQ.

When I use the float data type, the total execution time is 0.000176 sec. When I change float to int, the total time is 0.000026 sec (~7x faster). I don't know the reason for this difference. Related question: assembly output O3

I use this build command in the Geany IDE: `gcc -Wall -march=native -O2 -o "%e" "%f"`. I also tried -O3 and -Ofast, but they do not fix the problem.

I also read this question, but there are too many differences between it and this float/int implementation. Since the float implementation is 7 times slower than the corresponding int implementation, this is not a duplicate question.

#include <stdio.h>
#include <time.h>

float a[32][32],
      t[32][32],
      c_result[32][32],
      c_tra[32][32];


int main()
{
    int w = 10000;
    int i, j, k, temp;
    struct timespec tStart, tEnd;
    double tTotal , tBest=10000;
    do{
        clock_gettime(CLOCK_MONOTONIC,&tStart);


         for( i = 0; i < 32; i++){
            for( j =0 ; j < 32; j++){
                temp=0;
                for( k = 0 ;k < 32; k++)    {
                    temp += a[i][k] * c_tra[j][k];
                }   
                c_result[i][j]= temp;
            }
        }

        clock_gettime(CLOCK_MONOTONIC,&tEnd);
        tTotal = (tEnd.tv_sec - tStart.tv_sec);
        tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
        if(tTotal<tBest)
            tBest=tTotal;

    }while(w--);
    printf(" The best time: %lf sec\n",tBest);

    return 0;
}

This is the assembly output for the int data type:

.file   "floatMULm.c"
    .section    .rodata.str1.8,"aMS",@progbits,1
    .align 8
.LC2:
    .string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
    .section    .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl  main
    .type   main, @function
main:
.LFB24:
    .cfi_startproc
    subq    $40, %rsp
    .cfi_def_cfa_offset 48
    movl    $1, %edi
    movq    %rsp, %rsi
    call    clock_gettime
    xorl    %esi, %esi
.L2:
    xorl    %r8d, %r8d
    .p2align 4,,10
    .p2align 3
.L7:
    movq    %r8, %rdi
    xorl    %eax, %eax
    xorl    %ecx, %ecx
    salq    $5, %rdi
    .p2align 4,,10
    .p2align 3
.L5:
    movl    a(%rsi,%rax), %edx
    imull   c_tra(%rdi,%rax), %edx
    addq    $4, %rax
    addl    %edx, %ecx
    cmpq    $128, %rax
    jne .L5
    movl    %ecx, c_result(%rsi,%r8)
    addq    $4, %r8
    cmpq    $128, %r8
    jne .L7
    subq    $-128, %rsi
    cmpq    $4096, %rsi
    jne .L2
    leaq    16(%rsp), %rsi
    movl    $1, %edi
    call    clock_gettime
    movq    24(%rsp), %rax
    subq    8(%rsp), %rax
    movl    $32, %r8d
    movl    $32, %ecx
    movl    $10000, %edx
    movl    $.LC2, %esi
    movl    $1, %edi
    vcvtsi2sdq  %rax, %xmm1, %xmm1
    movq    16(%rsp), %rax
    subq    (%rsp), %rax
    vcvtsi2sdq  %rax, %xmm0, %xmm0
    movl    $1, %eax
    vdivsd  .LC1(%rip), %xmm1, %xmm1
    vaddsd  %xmm0, %xmm1, %xmm0
    vminsd  .LC0(%rip), %xmm0, %xmm0
    call    __printf_chk
    xorl    %eax, %eax
    addq    $40, %rsp
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE24:
    .size   main, .-main
    .comm   c_tra,4096,32
    .comm   c_result,4096,32
    .comm   t,4096,32
    .comm   a,4096,32
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
.LC0:
    .long   0
    .long   1086556160
    .align 8
.LC1:
    .long   0
    .long   1104006501
    .ident  "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
    .section    .note.GNU-stack,"",@progbits

And this is the assembly output for float:

.file   "floatMULm.c"
    .section    .rodata.str1.8,"aMS",@progbits,1
    .align 8
.LC2:
    .string " The best time: %lf sec in %d repetition for %dX%d matrix\n"
    .section    .text.startup,"ax",@progbits
    .p2align 4,,15
    .globl  main
    .type   main, @function
main:
.LFB24:
    .cfi_startproc
    subq    $40, %rsp
    .cfi_def_cfa_offset 48
    movl    $1, %edi
    movq    %rsp, %rsi
    call    clock_gettime
    xorl    %ecx, %ecx
.L2:
    xorl    %edi, %edi
    .p2align 4,,10
    .p2align 3
.L7:
    movq    %rdi, %rsi
    xorl    %eax, %eax
    xorl    %edx, %edx
    salq    $5, %rsi
    .p2align 4,,10
    .p2align 3
.L5:
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  a(%rcx,%rax), %xmm2
    vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
    addq    $4, %rax
    vcvttss2si  %xmm0, %edx
    cmpq    $128, %rax
    jne .L5
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  %xmm0, c_result(%rcx,%rdi)
    addq    $4, %rdi
    cmpq    $128, %rdi
    jne .L7
    subq    $-128, %rcx
    cmpq    $4096, %rcx
    jne .L2
    leaq    16(%rsp), %rsi
    movl    $1, %edi
    call    clock_gettime
    movq    24(%rsp), %rax
    subq    8(%rsp), %rax
    movl    $32, %r8d
    movl    $32, %ecx
    movl    $10000, %edx
    movl    $.LC2, %esi
    movl    $1, %edi
    vcvtsi2sdq  %rax, %xmm1, %xmm1
    movq    16(%rsp), %rax
    subq    (%rsp), %rax
    vcvtsi2sdq  %rax, %xmm0, %xmm0
    movl    $1, %eax
    vdivsd  .LC1(%rip), %xmm1, %xmm1
    vaddsd  %xmm0, %xmm1, %xmm0
    vminsd  .LC0(%rip), %xmm0, %xmm0
    call    __printf_chk
    xorl    %eax, %eax
    addq    $40, %rsp
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE24:
    .size   main, .-main
    .comm   c_tra,4096,32
    .comm   c_result,4096,32
    .comm   t,4096,32
    .comm   a,4096,32
    .section    .rodata.cst8,"aM",@progbits,8
    .align 8
.LC0:
    .long   0
    .long   1086556160
    .align 8
.LC1:
    .long   0
    .long   1104006501
    .ident  "GCC: (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4"
    .section    .note.GNU-stack,"",@progbits
  • It is well known that floating point operations are slower than integer ones. – Dr. Snoopy May 06 '16 at 08:39
  • Does your CPU have an FPU? – GMichael May 06 '16 at 08:39
  • Arithmetic operations on `float`, although surprisingly quick given what goes on under the hood, are slower than operations on `int`. – Bathsheba May 06 '16 at 08:39
  • I know that, but the difference should be a bit, not 7 times slower. I implemented this with `AVX` and `AVX2`, and the times are almost the same, about `0.000005` sec. – ADMS May 06 '16 at 08:42
  • It's not a duplicate question; read the related question. – ADMS May 06 '16 at 08:43
  • Are you compiling with `-march=native`? [Demo](https://godbolt.org/g/OVbg2x) – Kerrek SB May 06 '16 at 08:54
  • Your code doesn't compile. Those headers don't exist. – Kerrek SB May 06 '16 at 08:55
  • The whole inner loop should be optimized away because you are only ever adding and multiplying zeros. I can only guess that the compiler assumes that `clock_gettime` may alter `a` and `c_tra`, so it won't optimize. But that relies on assumptions, so you should first improve the test case, AFAICS. – Johannes Schaub - litb May 06 '16 at 08:57
  • Could you post the assembly your compiler generates for both the integer and float versions? – EOF May 06 '16 at 08:57
  • What happens if you change `main()`'s `int temp` to `float temp` in the floating-point version? – EOF May 06 '16 at 10:31
  • Good point, now it works! It was because of the conversion between the `float` and `int` data types in the inner loop. – ADMS May 06 '16 at 10:37
  • Yeah, looking at http://www.agner.org/optimize/instruction_tables.pdf, `CVTSI2SS` and `CVT(T)SS2SI` each have 6 cycles latency. Since they are in the dependency chain in your inner loop, that hurts plenty. Of course the next question would be why the loop is not vectorized in either the integer or float version, even with `-Ofast`. – EOF May 06 '16 at 10:39

1 Answer

The problem is the inner loop of the floating-point version:

.L5:
    vcvtsi2ss   %edx, %xmm0, %xmm0
    vmovss  a(%rcx,%rax), %xmm2
    vfmadd231ss c_tra(%rsi,%rax), %xmm2, %xmm0
    addq    $4, %rax
    vcvttss2si  %xmm0, %edx
    cmpq    $128, %rax
    jne .L5

Because `temp` in `main()` is of type `int` (corresponding to `%edx` in the assembly), the value has to be converted back and forth between `float` and `int` in the loop. According to http://www.agner.org/optimize/instruction_tables.pdf, CVTSI2SS and CVT(T)SS2SI each have a latency of 6 cycles on Skylake. Furthermore, the conversions are part of the loop-carried dependency chain, so out-of-order and superscalar execution cannot help much in this case.

Changing `main()`'s `int temp` to `float temp` removes these conversions.
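For reference, a minimal sketch of what that change looks like in the question's loop (only the accumulator's type changes; the arrays and the rest of main() stay exactly as posted in the question):

/* Sketch: the question's inner loops with a float accumulator instead of int.
   With no int<->float conversions left in the loop-carried dependency chain,
   the 6-cycle CVTSI2SS/CVTTSS2SI latencies disappear from the critical path. */
int i, j, k;
float temp;                               /* was: int temp; */
for (i = 0; i < 32; i++) {
    for (j = 0; j < 32; j++) {
        temp = 0.0f;
        for (k = 0; k < 32; k++) {
            temp += a[i][k] * c_tra[j][k];
        }
        c_result[i][j] = temp;
    }
}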

  • With `-ffast-math`, gcc might be able to replace the truncation to `int` with a `floor` (using SSE4.1 `roundps` if available). Of course, a `float` can hold values greater than `INT_MAX`... Turns out [even `-ffast-math` doesn't get rid of the round trip](https://godbolt.org/g/egKCEI) with gcc 6.1 or clang 3.8. With `float` and `-ffast-math`, it gets auto-vectorized for what should be a massive speedup. – Peter Cordes May 06 '16 at 17:55
  • Anyway, the OP could maybe use `min(temp, (float)INT_MAX)` to range-limit. (Note that converting to integer doesn't saturate to INT_MAX; you actually get undefined results. IIRC, x86's `cvttss2si` gives you the "integer indefinite" value for floats that are out of range, which is just the sign bit set, i.e. INT_MIN.) – Peter Cordes May 06 '16 at 17:57
  • @PeterCordes It is already using a truncation: `vcvttss2si` rather than `vcvtss2si`. – EOF May 06 '16 at 17:57
  • Yeah, but if the OP wanted the `floor` behaviour of the round trip to `int`, without *actually* converting to `int`, an FP rounding instruction would be way faster, e.g. use `floor`, so the compiler can implement it with `roundps` with the imm8 rounding mode set appropriately. For example, [this version auto-vectorizes](https://godbolt.org/g/c0gsFH), doing `temp += floor( ... * ... )`. (Don't do `temp = floor(temp)`; that defeats auto-vectorization, probably because the input to `floor()` is the partial sum, which auto-vectorization computes in a different order.) – Peter Cordes May 06 '16 at 18:00
  • Ah, interesting point. The compiler is probably not allowed to actually do this on its own, so you'd need a `truncf()`. Anyway, it seems the OP is fine with using a `float temp` and keeping the fractional part. And yes, this actually vectorizes fine for me as well. The one part I *don't* understand is why this is flagged as a duplicate of a completely different (more abstract) question. – EOF May 06 '16 at 18:05
  • I think we're saying the same thing. `floorf` and `truncf` are very similar, and both can be implemented with different choices for `vroundps`'s immediate operand. That's the point of `vroundps`: it can round towards +/-Inf (`floorf`), towards 0 (`truncf`), or to nearest (`nearbyintf`), without changing the MXCSR rounding mode. Look at the link in my previous comment: a well-placed `floor()` auto-vectorizes, but using it on the accumulator doesn't (because a different order of summation means the value being rounded is different; this could matter if an intermediate gets so big that the rounding error is > 1.0). – Peter Cordes May 06 '16 at 18:12
  • I agree, it's not actually a duplicate, since this question is asking about mixing float and int vs. using pure float. So the answer is conversion overhead, not that FP math is much higher latency / slightly lower throughput. Wait, actually that linked question is about `sumi += rand() % 365` vs. `sumf += (float)(rand() % 365)`. Of course, the OP there doesn't have a useful benchmark, because it bottlenecks on rand and integer modulo! So, not the same after all. – Peter Cordes May 06 '16 at 18:14
  • Anyway, chalk this one up to C89-style variable declarations at the top of the function, instead of declaring temporaries as they're initialized. Bad style -> hidden bugs, of both the performance and correctness variety in this case. – Peter Cordes May 06 '16 at 18:21
  • Yes, I find this question to actually be better than the question it's supposedly a duplicate of. Also, the upvoted comments here proudly proclaiming their ignorance about modern x86 CPUs are a bit much. – EOF May 06 '16 at 18:21
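To illustrate the floor()/truncf() idea discussed in the comments above, here is a minimal, hypothetical sketch (not from the original post): it keeps a truncation in the loop without ever converting to int, by truncating each product rather than round-tripping the running sum. The choice of truncf() (EOF's suggestion; floorf() in Peter Cordes's linked example) and the flags -O3 -ffast-math -march=native are assumptions for illustration. Note the result can differ from the original int round trip, because the original truncated the accumulated sum each iteration, not the individual products.

#include <math.h>

/* Hypothetical variant: truncate each product before accumulating, dropping
   into the question's main(). On SSE4.1/AVX targets truncf() can compile to a
   single vroundss/vroundps, so with -ffast-math the reduction can still be
   auto-vectorized instead of serializing on int<->float conversions. */
int i, j, k;
float temp;
for (i = 0; i < 32; i++) {
    for (j = 0; j < 32; j++) {
        temp = 0.0f;
        for (k = 0; k < 32; k++) {
            temp += truncf(a[i][k] * c_tra[j][k]);
        }
        c_result[i][j] = temp;
    }
}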