While benchmarking two functions, I noticed that the execution times do not make sense. One function simply returns its argument (converted to float), while the other computes the square root of its argument.
Isolated Source:
#include <math.h>
#include <stdio.h>
#include <time.h>
/* Baseline case: convert the argument to float and return it; no other work. */
float impl(unsigned long n) {
    return (float)n;
}
float impl_sqrt(unsigned long n) { return sqrt(n); }
/*
 * Call fun(n) iter times and print the elapsed time in seconds to stdout.
 *
 * fun  - function under test; its return value is discarded
 * iter - number of calls to make
 * n    - argument passed to every call
 *
 * Timestamps are taken with CLOCK_MONOTONIC immediately before and after the
 * loop, so the reported time includes the loop and indirect-call overhead.
 */
void bench(float (*fun)(unsigned long), size_t iter, unsigned long n) {
    struct timespec start, stop;

    clock_gettime(CLOCK_MONOTONIC, &start);
    for (size_t call = 0; call < iter; call++) {
        fun(n);
    }
    clock_gettime(CLOCK_MONOTONIC, &stop);

    /* Whole seconds plus the nanosecond difference scaled to seconds. */
    const double elapsed =
        (stop.tv_sec - start.tv_sec) + 1e-9 * (stop.tv_nsec - start.tv_nsec);
    printf("Time %f seconds\n", elapsed);
}
/*
 * Benchmark impl and impl_sqrt with the same iteration count and the same
 * argument. A label for each run is written to stderr before it starts.
 */
int main(void) {
    const size_t iter = 100000000;
    const unsigned long arg = 271;

    fputs("Constant\n", stderr);
    bench(impl, iter, arg);

    fputs("Sqrt\n", stderr);
    bench(impl_sqrt, iter, arg);

    return 0;
}
Compiler: Apple clang version 14.0.3 (clang-1403.0.22.14.1) Target: arm64-apple-darwin22.1.0 Thread model: posix
aarch64 Apple M1 Pro
On macOS with clang -O0
$ ./a.out
Constant
Time 0.216895 seconds
Sqrt
Time 0.216063 seconds
On macOS with clang -O0 -fno-builtin-sqrt
$ ./a.out
Constant
Time 0.216224 seconds
Sqrt
Time 0.253717 seconds
How is it possible that simply returning the argument (the "Constant" case) is slower than calculating its square root? Looking at the disassembly, I cannot see any optimisation done by the compiler that would justify the performance difference.
Timing the program as shown above yields these unexpected results. How is this optimisation done, and where does it happen?
Here are parts of the disassembled binary from Ghidra:
int _bench(code *param_1,ulong param_2,undefined8 param_3)
{
int iVar1;
ulong local_50;
long local_48;
long local_40;
long local_38;
long local_30;
undefined8 local_28;
ulong local_20;
code *local_18;
local_28 = param_3;
local_20 = param_2;
local_18 = param_1;
_clock_gettime(6,&local_38);
for (local_50 = 0; local_50 < local_20; local_50 = local_50 + 1) {
(*local_18)(local_28);
}
_clock_gettime(6,&local_48);
NEON_fmadd(0x3e112e0be826d695,(double)(local_40 - local_30),(double)(local_48 - local_38));
iVar1 = _printf("Time %f seconds\n");
return iVar1;
}
**************************************************************
* FUNCTION *
**************************************************************
undefined _bench()
undefined w0:1 <RETURN>
undefined8 Stack[-0x10]:8 local_10 XREF[2]: 100003e08(W),
100003ebc(*)
undefined8 Stack[-0x18]:8 local_18 XREF[2]: 100003e10(W),
100003e48(R)
undefined8 Stack[-0x20]:8 local_20 XREF[2]: 100003e14(W),
100003e34(R)
undefined8 Stack[-0x28]:8 local_28 XREF[2]: 100003e18(W),
100003e4c(R)
undefined8 Stack[-0x30]:8 local_30 XREF[1]: 100003e88(R)
undefined8 Stack[-0x38]:8 local_38 XREF[1]: 100003e78(R)
undefined8 Stack[-0x40]:8 local_40 XREF[1]: 100003e84(R)
undefined8 Stack[-0x48]:8 local_48 XREF[1]: 100003e74(R)
undefined8 Stack[-0x50]:8 local_50 XREF[4]: 100003e28(W),
100003e30(R),
100003e58(R),
100003e60(W)
undefined8 Stack[-0x58]:8 local_58 XREF[2]: 100003ea0(W),
100003ea4(R)
undefined8 Stack[-0x60]:8 local_60 XREF[1]: 100003eac(W)
_bench XREF[3]: Entry Point(*),
entry:100003f14(c),
entry:100003f3c(c)
100003e04 ff 83 01 d1 sub sp,sp,#0x60
100003e08 fd 7b 05 a9 stp x29,x30,[sp, #local_10]
100003e0c fd 43 01 91 add x29,sp,#0x50
100003e10 a0 83 1f f8 stur x0,[x29, #local_18]
100003e14 a1 03 1f f8 stur x1,[x29, #local_20]
100003e18 a2 83 1e f8 stur x2,[x29, #local_28]
100003e1c c0 00 80 52 mov w0,#0x6
100003e20 e1 a3 00 91 add x1,sp,#0x28
100003e24 4b 00 00 94 bl _clock_gettime undefined _clock_gettime()
100003e28 ff 0b 00 f9 str xzr,[sp, #local_50]
100003e2c 01 00 00 14 b LAB_100003e30
LAB_100003e30 XREF[2]: 100003e2c(j), 100003e64(j)
100003e30 e8 0b 40 f9 ldr x8,[sp, #local_50]
100003e34 a9 03 5f f8 ldur x9,[x29, #local_20]
100003e38 08 01 09 eb subs x8,x8,x9
100003e3c e8 37 9f 1a cset w8,cs
100003e40 48 01 00 37 tbnz w8,#0x0,LAB_100003e68
100003e44 01 00 00 14 b LAB_100003e48
LAB_100003e48 XREF[1]: 100003e44(j)
100003e48 a8 83 5f f8 ldur x8,[x29, #local_18]
100003e4c a0 83 5e f8 ldur x0,[x29, #local_28]
100003e50 00 01 3f d6 blr x8
100003e54 01 00 00 14 b LAB_100003e58
LAB_100003e58 XREF[1]: 100003e54(j)
100003e58 e8 0b 40 f9 ldr x8,[sp, #local_50]
100003e5c 08 05 00 91 add x8,x8,#0x1
100003e60 e8 0b 00 f9 str x8,[sp, #local_50]
100003e64 f3 ff ff 17 b LAB_100003e30
LAB_100003e68 XREF[1]: 100003e40(j)
100003e68 c0 00 80 52 mov w0,#0x6
100003e6c e1 63 00 91 add x1,sp,#0x18
100003e70 38 00 00 94 bl _clock_gettime undefined _clock_gettime()
100003e74 e8 0f 40 f9 ldr x8,[sp, #local_48]
100003e78 e9 17 40 f9 ldr x9,[sp, #local_38]
100003e7c 08 01 09 eb subs x8,x8,x9
100003e80 02 01 62 9e scvtf d2,x8
100003e84 e8 13 40 f9 ldr x8,[sp, #local_40]
100003e88 e9 1b 40 f9 ldr x9,[sp, #local_30]
100003e8c 08 01 09 eb subs x8,x8,x9
100003e90 01 01 62 9e scvtf d1,x8
100003e94 08 00 00 90 adrp x8,0x100003000
100003e98 00 c1 47 fd ldr d0,[x8, #0xf80]=>DAT_100003f80 = 3E112E0BE826D695h
100003e9c 00 08 41 1f fmadd d0,d0,d1,d2
100003ea0 e0 07 00 fd str d0,[sp, #local_58]
100003ea4 e0 07 40 fd ldr d0,[sp, #local_58]
100003ea8 e8 03 00 91 mov x8,sp
100003eac 00 01 00 fd str d0,[x8]=>local_60
100003eb0 00 00 00 90 adrp x0,0x100003000
100003eb4 00 20 3e 91 add x0=>s_Time_%f_seconds_100003f88,x0,#0xf88 = "Time %f seconds\n"
100003eb8 2c 00 00 94 bl _printf int _printf(char * param_1, ...)
100003ebc fd 7b 45 a9 ldp x29=>local_10,x30,[sp, #0x50]
100003ec0 ff 83 01 91 add sp,sp,#0x60
100003ec4 c0 03 5f d6 ret
float _impl(ulong param_1)
{
return (float)(unkuint9)param_1;
}
//
// __text
// __TEXT
// ram:100003dbc-ram:100003f4f
//
**************************************************************
* FUNCTION *
**************************************************************
undefined _impl()
undefined w0:1 <RETURN>
undefined8 Stack[-0x8]:8 local_8 XREF[2]: 100003dc0(W),
100003dc4(R)
_impl XREF[2]: Entry Point(*),
entry:100003f08(*)
100003dbc ff 43 00 d1 sub sp,sp,#0x10
100003dc0 e0 07 00 f9 str x0,[sp, #local_8]
100003dc4 e0 07 40 fd ldr d0,[sp, #local_8]
100003dc8 08 00 66 9e fmov x8,d0
100003dcc 00 01 23 9e ucvtf s0,x8
100003dd0 ff 43 00 91 add sp,sp,#0x10
100003dd4 c0 03 5f d6 ret
float _impl_sqrt(undefined8 param_1)
{
double dVar1;
dVar1 = (double)NEON_ucvtf(param_1);
return (float)SQRT(dVar1);
}
**************************************************************
* FUNCTION *
**************************************************************
undefined _impl_sqrt()
undefined w0:1 <RETURN>
undefined8 Stack[-0x8]:8 local_8 XREF[2]: 100003df4(W),
100003df8(R)
_impl_sqrt XREF[2]: Entry Point(*),
entry:100003f44(*)
100003df0 ff 43 00 d1 sub sp,sp,#0x10
100003df4 e0 07 00 f9 str x0,[sp, #local_8]
100003df8 e0 07 40 fd ldr d0,[sp, #local_8]
100003dfc 00 d8 61 7e ucvtf d0,d0
100003e00 00 c0 61 1e fsqrt d0,d0
100003e04 00 40 62 1e fcvt s0,d0
100003e08 ff 43 00 91 add sp,sp,#0x10
100003e0c c0 03 5f d6 ret
undefined8 entry(void)
{
undefined *puVar1;
puVar1 = PTR____stderrp_100004000;
_fputs("Constant\n",*(FILE **)PTR____stderrp_100004000);
_bench(_impl,100000000);
_fputs("Sqrt\n",*(FILE **)puVar1);
_bench(_impl_sqrt,100000000,0x10f);
return 0;
}