To avoid UB, I rewrote the alpha function:

void alpha(){
    int i = 0;
    while (i < 100)
    {
        /* four elements per pass, unrolled by hand */
        a[i] += b[i];
        i++;
        a[i] += b[i];
        i++;
        a[i] += b[i];
        i++;
        a[i] += b[i];
        i++;
    }
}
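
For reference, beta (whose assembly appears below) is just the plain, non-unrolled loop. A minimal sketch, assuming the global int a[100] and int b[100] that the .zero 400 directives in the x86 output imply:

int a[100];
int b[100];

void beta(){
    for (int i = 0; i < 100; i++)
        a[i] += b[i];
}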
The generated code depends on the platform. For x86 it is exactly the same for both functions: the compiler vectorizes both loops into the same SSE sequence (paddd on 16 bytes, i.e. four ints, at a time):
beta:
        xor eax, eax
.L2:
        movdqa xmm0, XMMWORD PTR a[rax]
        paddd xmm0, XMMWORD PTR b[rax]
        add rax, 16
        movaps XMMWORD PTR a[rax-16], xmm0
        cmp rax, 400
        jne .L2
        ret
alpha:
        xor eax, eax
.L6:
        movdqa xmm0, XMMWORD PTR a[rax]
        paddd xmm0, XMMWORD PTR b[rax]
        add rax, 16
        movaps XMMWORD PTR a[rax-16], xmm0
        cmp rax, 400
        jne .L6
        ret
b:
        .zero 400
a:
        .zero 400
But if we target an ARM Cortex core, alpha will execute faster: the compiler preserves alpha's four-way manual unrolling (four loads, adds, and stores per iteration), while beta stays a one-element-per-iteration loop.
beta:
        ldr r3, .L6
        ldr r1, .L6+4
        add ip, r3, #400
.L2:
        ldr r2, [r3, #4]!
        ldr r0, [r1, #4]!
        cmp r3, ip
        add r2, r2, r0
        str r2, [r3]
        bne .L2
        bx lr
.L6:
        .word a-4
        .word b-4
alpha:
        ldr r3, .L13
        ldr r2, .L13+4
        push {r4, r5, r6, r7, r8, lr}
        add r7, r3, #400
.L9:
        ldr lr, [r3]
        ldr ip, [r3, #4]
        ldr r0, [r3, #8]
        ldr r1, [r3, #12]
        ldr r8, [r2]
        ldr r6, [r2, #4]
        ldr r5, [r2, #8]
        ldr r4, [r2, #12]
        add lr, lr, r8
        add ip, ip, r6
        add r0, r0, r5
        add r1, r1, r4
        str lr, [r3]
        str ip, [r3, #4]
        str r0, [r3, #8]
        str r1, [r3, #12]
        add r3, r3, #16
        cmp r3, r7
        add r2, r2, #16
        bne .L9
        pop {r4, r5, r6, r7, r8, pc}
.L13:
        .word a
        .word b
So the general answer is: always benchmark the code on your target platform.
https://godbolt.org/z/sWjqE1
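
If you want to measure the difference yourself, a minimal harness could look like the sketch below. It is only a sketch: both versions are repeated here so the file is self-contained, the timing uses clock() from <time.h>, and the repetition count is arbitrary; compile with the optimization flags you actually care about and run on the real target.

#include <stdio.h>
#include <time.h>

int a[100];
int b[100];

/* plain loop */
void beta(void){
    for (int i = 0; i < 100; i++)
        a[i] += b[i];
}

/* hand-unrolled loop, as above */
void alpha(void){
    int i = 0;
    while (i < 100)
    {
        a[i] += b[i]; i++;
        a[i] += b[i]; i++;
        a[i] += b[i]; i++;
        a[i] += b[i]; i++;
    }
}

/* call fn repeatedly and return the elapsed CPU time in seconds */
static double time_it(void (*fn)(void), long reps){
    clock_t start = clock();
    for (long r = 0; r < reps; r++)
        fn();
    return (double)(clock() - start) / CLOCKS_PER_SEC;
}

int main(void){
    const long reps = 10000000L;  /* arbitrary; pick a count that runs long enough to measure */
    printf("alpha: %.3f s\n", time_it(alpha, reps));
    printf("beta:  %.3f s\n", time_it(beta, reps));
    return 0;
}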