-4

Which would you expect to be faster? (Assume that arrays a[100] and b[100] are initialized globals)

/* Element-wise a[k] += b[k] over the 100 entries, one entry per iteration. */
void beta(){
    int k;

    for (k = 0; k != 100; ++k)
        a[k] += b[k];
}

/*
 * Manually unrolled variant: the while body holds four copies of
 * "a[i] += b[i++]", so it is written to step i by 4 per iteration.
 *
 * NOTE(review): each of those statements reads i (in a[i]) and modifies i
 * (in b[i++]) with no sequencing between the two operands of +=. That is
 * undefined behavior in C, so the function has no guaranteed meaning.
 * Left exactly as posted.
 */
void alpha(){

    int i=0;
    while (i<100){
    a[i] += b[i++];   /* UB: unsequenced read of i and i++ */
    a[i] += b[i++];   /* UB: same pattern */
    a[i] += b[i++];   /* UB: same pattern */
    a[i] += b[i++];   /* UB: same pattern */
    }

}
MikeCAT
  • 73,922
  • 11
  • 45
  • 70
  • 6
    `alpha` has undefined behavior, so I'd go with `beta`. See [this question](https://stackoverflow.com/questions/949433/why-are-these-constructs-using-pre-and-post-increment-undefined-behavior) for more info. – user3386109 Oct 30 '20 at 21:57
  • 1
    Most likely both are the same, since compilers nowadays are really good at optimisation, to the point where they can even insert their own loop in place of repetitive operations. – Irelia Oct 30 '20 at 21:58

1 Answer

1

To avoid UB I rewrote the alpha function:

/*
 * Adds b[] into a[] for indices 0..99, handling four consecutive elements
 * per loop iteration (manual 4x unroll). Every subscript is a plain
 * expression without side effects, so no statement depends on the order in
 * which its operands are evaluated.
 */
void alpha(){
    int base;

    for (base = 0; base < 100; base += 4)
    {
        a[base]     += b[base];
        a[base + 1] += b[base + 1];
        a[base + 2] += b[base + 2];
        a[base + 3] += b[base + 3];
    }
}

and the generated code depends on the platform:

For x86 it is exactly the same.

# beta (x86-64, Intel syntax): vectorised a[] += b[].
# rax  = byte offset into both arrays, stepping by 16 from 0 to 400.
# xmm0 = the 16 bytes currently being summed.
beta:
        xor     eax, eax                        # offset = 0 (32-bit write also clears the upper half of rax)
.L2:
        movdqa  xmm0, XMMWORD PTR a[rax]        # load 16 bytes of a; aligned form, so a must be 16-byte aligned (alignment directive not shown here)
        paddd   xmm0, XMMWORD PTR b[rax]        # add the matching 16 bytes of b as four 32-bit lanes
        add     rax, 16                         # advance to the next 16-byte chunk
        movaps  XMMWORD PTR a[rax-16], xmm0     # store back to the chunk just loaded (rax was already advanced, hence -16)
        cmp     rax, 400                        # 400 bytes = whole array
        jne     .L2                             # 400 / 16 = 25 iterations
        ret
# alpha (x86-64, Intel syntax): instruction-for-instruction the same loop as
# the beta listing, differing only in the local label name (.L6).
# rax  = byte offset into both arrays, stepping by 16 from 0 to 400.
# xmm0 = the 16 bytes currently being summed.
alpha:
        xor     eax, eax                        # offset = 0 (32-bit write also clears the upper half of rax)
.L6:
        movdqa  xmm0, XMMWORD PTR a[rax]        # load 16 bytes of a; aligned form, so a must be 16-byte aligned (alignment directive not shown here)
        paddd   xmm0, XMMWORD PTR b[rax]        # add the matching 16 bytes of b as four 32-bit lanes
        add     rax, 16                         # advance to the next 16-byte chunk
        movaps  XMMWORD PTR a[rax-16], xmm0     # store back to the chunk just loaded (rax was already advanced, hence -16)
        cmp     rax, 400                        # 400 bytes = whole array
        jne     .L6                             # 400 / 16 = 25 iterations
        ret
# Storage for the two global arrays: 400 zero-filled bytes each, which matches
# the 400-byte bound the loops compare against.
b:
        .zero   400
a:
        .zero   400

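But if we consider ARM Cortex, alpha will execute faster: its loop stays unrolled four times, so the compare-and-branch overhead is paid once per four elements instead of once per element.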

@ beta (32-bit ARM): one word of a[] += b[] per loop iteration.
@ r3 = pointer into a, r1 = pointer into b (both start one word BEFORE the
@      array because the loads pre-increment), ip = end sentinel for r3.
beta:
        ldr     r3, .L6                 @ r3 = a-4 (from the literal pool below)
        ldr     r1, .L6+4               @ r1 = b-4
        add     ip, r3, #400            @ ip = (a-4)+400 = a+396, address of the last word processed
.L2:
        ldr     r2, [r3, #4]!           @ pre-indexed with writeback: r3 += 4, then r2 = *r3
        ldr     r0, [r1, #4]!           @ same for b: r1 += 4, then r0 = *r1
        cmp     r3, ip                  @ set flags early; the add/str below do not modify them
        add     r2, r2, r0              @ r2 = a word + b word
        str     r2, [r3]                @ write the sum back to a
        bne     .L2                     @ loop until r3 has reached the last word
        bx      lr                      @ return
.L6:
        @ Literal pool: array addresses biased by -4 to suit the pre-increment loads.
        .word   a-4
        .word   b-4
@ alpha (32-bit ARM): the 4x unroll from the C source is kept, so each loop
@ iteration handles four consecutive words (16 bytes) of a[] and b[].
@ r3 = pointer into a, r2 = pointer into b, r7 = end of a (a + 400 bytes).
@ lr, ip, r0, r1 hold the four a words; r8, r6, r5, r4 hold the four b words.
alpha:
        ldr     r3, .L13                @ r3 = a (from the literal pool below)
        ldr     r2, .L13+4              @ r2 = b
        push    {r4, r5, r6, r7, r8, lr} @ save the registers the loop uses as temporaries, plus lr
        add     r7, r3, #400            @ r7 = a + 400 bytes = loop end address
.L9:
        ldr     lr, [r3]                @ load four words of a ...
        ldr     ip, [r3, #4]
        ldr     r0, [r3, #8]
        ldr     r1, [r3, #12]
        ldr     r8, [r2]                @ ... and the four matching words of b
        ldr     r6, [r2, #4]
        ldr     r5, [r2, #8]
        ldr     r4, [r2, #12]
        add     lr, lr, r8              @ four independent sums
        add     ip, ip, r6
        add     r0, r0, r5
        add     r1, r1, r4
        str     lr, [r3]                @ write the four sums back to a
        str     ip, [r3, #4]
        str     r0, [r3, #8]
        str     r1, [r3, #12]
        add     r3, r3, #16             @ advance the a pointer by four words
        cmp     r3, r7                  @ reached the end of a?
        add     r2, r2, #16             @ advance the b pointer; plain add leaves the cmp flags intact
        bne     .L9                     @ 400 / 16 = 25 iterations
        pop     {r4, r5, r6, r7, r8, pc} @ restore saved registers; loading saved lr into pc returns
.L13:
        @ Literal pool: unbiased array addresses (the loads above use plain offsets).
        .word   a
        .word   b

So the general answer is: always benchmark the code

https://godbolt.org/z/sWjqE1

0___________
  • 60,014
  • 4
  • 34
  • 74