I know that different compilers generate different x86 assembly for different machines, and that the same x86 assembly can take a different number of clock cycles to run on different machines. Even so, what are the general guidelines for judging how fast a given piece of assembly will run?
For example, I currently have the following code where I want to use "abstraction at no overhead", but I am unable to verify whether foo (an abstraction over bar) is actually slower than bar and, if so, by how much.
;-----------------------------------------------------------------------
; foo(ComputeArray<double, 4ul>&, double)      (x86-64, Intel syntax, SSE2)
; In:    rdi  = address of the ComputeArray object (first pointer argument)
;        xmm0 = scalar s in the low lane (first floating-point argument)
;        (rdi / xmm0 argument passing matches the System V AMD64 convention)
; Does:  base = **(void ***)rdi, i.e. two dependent pointer loads, then
;          [base+192 .. base+223] = s * [base+224 .. base+255]
;                                 + (s + s) * [base+256 .. base+287]
;        computed as two packed halves of two doubles each.
;        NOTE(review): if the buffer is rows of four 8-byte doubles, these
;        offsets are rows 6, 7 and 8; confirm against ComputeArray's layout.
; Clobb: rax, xmm0-xmm4
; Cost:  2 pointer loads, 4 vector loads, 1 addsd, 2 unpcklpd, 4 mulpd,
;        2 addpd, 2 vector stores.
;-----------------------------------------------------------------------
foo(ComputeArray<double, 4ul>&, double):
mov rax, QWORD PTR [rdi]                ; rax = first 8 bytes of the object; used as a pointer below
movapd xmm1, xmm0                       ; keep a copy of s before xmm0 is overwritten
addsd xmm0, xmm0                        ; xmm0[0] = s + s
unpcklpd xmm1, xmm1                     ; xmm1 = {s, s}
mov rax, QWORD PTR [rax]                ; rax = base; second load depends on the first
unpcklpd xmm0, xmm0                     ; xmm0 = {s+s, s+s}
movupd xmm3, XMMWORD PTR [rax+224]      ; xmm3 = doubles at base+224, base+232
movupd xmm2, XMMWORD PTR [rax+240]      ; xmm2 = doubles at base+240, base+248
movupd xmm4, XMMWORD PTR [rax+272]      ; xmm4 = doubles at base+272, base+280
mulpd xmm3, xmm1                        ; xmm3 = [base+224] * s
mulpd xmm2, xmm1                        ; xmm2 = [base+240] * s
movupd xmm1, XMMWORD PTR [rax+256]      ; xmm1 = doubles at base+256, base+264 ({s, s} no longer needed)
mulpd xmm1, xmm0                        ; xmm1 = [base+256] * (s+s)
mulpd xmm0, xmm4                        ; xmm0 = (s+s) * [base+272]
addpd xmm1, xmm3                        ; xmm1 = (s+s)*[base+256] + s*[base+224]
addpd xmm0, xmm2                        ; xmm0 = (s+s)*[base+272] + s*[base+240]
movups XMMWORD PTR [rax+192], xmm1      ; store low half of the result at base+192
movups XMMWORD PTR [rax+208], xmm0      ; store high half of the result at base+208
ret
;-----------------------------------------------------------------------
; bar(double (*) [4], double)                  (x86-64, Intel syntax, SSE2)
; In:    rdi  = a, pointer to rows of four doubles (first pointer argument)
;        xmm0 = scalar s in the low lane (first floating-point argument)
;        (rdi / xmm0 argument passing matches the System V AMD64 convention)
; Does:  with 8-byte doubles a row is 32 bytes, so offsets 192 / 224 / 256
;        are rows 6 / 7 / 8:
;          a[6][j] = ((a[8][j] + a[8][j]) + a[7][j]) * s     for j = 0..3
;        computed as two packed halves of two doubles each, addressed
;        directly off rdi with no pointer loads.
; Clobb: xmm0, xmm1, xmm2, xmm4, xmm6
; Cost:  4 vector loads, 1 unpcklpd, 4 addpd, 2 mulpd, 2 vector stores.
;-----------------------------------------------------------------------
bar(double (*) [4], double):
movapd xmm2, xmm0                       ; move s out of xmm0, which is reused for data below
movupd xmm1, XMMWORD PTR [rdi+256]      ; xmm1 = a[8][0..1]
movupd xmm0, XMMWORD PTR [rdi+272]      ; xmm0 = a[8][2..3]
movupd xmm4, XMMWORD PTR [rdi+224]      ; xmm4 = a[7][0..1]
movupd xmm6, XMMWORD PTR [rdi+240]      ; xmm6 = a[7][2..3]
unpcklpd xmm2, xmm2                     ; xmm2 = {s, s}
addpd xmm1, xmm1                        ; xmm1 = a[8][0..1] + a[8][0..1]
addpd xmm0, xmm0                        ; xmm0 = a[8][2..3] + a[8][2..3]
addpd xmm1, xmm4                        ; xmm1 += a[7][0..1]
addpd xmm0, xmm6                        ; xmm0 += a[7][2..3]
mulpd xmm1, xmm2                        ; scale the summed low half by s
mulpd xmm0, xmm2                        ; scale the summed high half by s
movups XMMWORD PTR [rdi+192], xmm1      ; a[6][0..1] = low half of the result
movups XMMWORD PTR [rdi+208], xmm0      ; a[6][2..3] = high half of the result
ret