When running the attached sample program, the function tan appears to be about twice as slow when called in context as when it is benchmarked in isolation. This is the output on my machine:
justtan(): ~16.062430 ns/iter
notan(): ~30.852820 ns/iter
withtan(): ~60.703100 ns/iter
empty(): ~0.355270 ns/iter
I would expect withtan() to be ~45ns or lower, given that it is just a combination of justtan() and notan() (roughly 16 ns + 31 ns).
I'm running macOS 11.5.2 with an Intel i7-4980HQ CPU. My cc --version is Apple clang version 13.0.0 (clang-1300.0.29.3). I've checked that the disassembly for withtan and notan is identical except for the call to tan, and that clang is autovectorizing the loops with VEX instructions. I've also checked in a debugger that the version of tan which is called at runtime uses VEX instructions as well, so it should avoid the SSE/AVX transition penalty.
I compiled and ran the program in a Linux VM and got a similar result (in the debugger, tan also uses AVX/VEX there). Additionally, I ran it under cachegrind and found essentially no L1 cache misses (0.00%) for any of the functions; however, when running under cachegrind, the times do add up as expected.
This is how I'm running the executable:
cc -Wall -O3 -mavx2 -o main main.c && ./main
Here is main.c:
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
// ---------------------------------------------------------------------
// -------------------- benchmarking harness ---------------------------
// Number of timed calls bench() makes per measurement; bench() also performs
// ITERS / 10 untimed warmup calls beforehand.
int64_t ITERS = 100000000;
// Optimization barrier for benchmark results: returns x unchanged, but first
// hands the address of x to an empty inline-asm statement that clobbers
// "memory", so the compiler has to treat x as observed and cannot discard
// the computation that produced it.
//
// Spelled __asm__ rather than asm so the file also builds in strict ISO
// modes (e.g. -std=c11), where the plain asm keyword is not recognized.
// __volatile__ only makes explicit what already holds for an asm statement
// without output operands.
double black_box(double x) {
    __asm__ __volatile__("" : : "r"(&x) : "memory");
    return x;
}
// Returns the current CLOCK_MONOTONIC reading in nanoseconds.
//
// If clock_gettime() reports an error, 0 is returned instead of computing a
// value from a timespec that was never filled in.
uint64_t nanosec(void) {
    struct timespec ts;

    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        return 0;
    }

    // Convert explicitly to unsigned 64-bit before scaling so the arithmetic
    // does not rely on implicit signed -> unsigned conversions.
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}
// Measures the average wall-clock cost of one call to f, in nanoseconds.
//
// f is first called ITERS / 10 times untimed as a warmup, then ITERS times
// between two nanosec() readings. Every result is passed through black_box()
// so the calls cannot be optimized away.
//
// Returns (elapsed nanoseconds) / ITERS as a double.
double bench(double (*f)()) {
    // Read the global once: black_box() clobbers memory, so using ITERS
    // directly in the loop conditions would force a reload (and, for the
    // warmup, a division) on every iteration.
    const int64_t iters = ITERS;
    const int64_t warmup_iters = iters / 10;

    // Warmup. The counters are int64_t to match ITERS; an int counter would
    // overflow (undefined behavior) if ITERS were raised above INT_MAX.
    for (int64_t i = 0; i < warmup_iters; i++) {
        black_box(f());
    }

    uint64_t start = nanosec();
    for (int64_t i = 0; i < iters; i++) {
        black_box(f());
    }
    uint64_t end = nanosec();

    return (double)(end - start) / (double)iters;
}
// -------------------- end benchmarking harness -----------------------
// ---------------------------------------------------------------------
// Number of elements in VALS.
#define LEN 32
// Window length used by the summing kernels. NOTE(review): this macro is not
// referenced in the visible code; sum24() hard-codes the same value.
#define SUM_LEN 24
// Input data for the benchmark kernels: LEN doubles, all initialized to 1.
double VALS[LEN] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
// Returns the sum of the 24 consecutive doubles starting at ptr.
//
// The elements are accumulated strictly left to right, starting from 0.0.
// ptr must point to at least 24 readable doubles; they are only read, hence
// the const qualifier (callers passing a plain double* are unaffected).
// Marked noinline so each call remains a real call in the benchmark kernels.
__attribute__ ((noinline))
double sum24(const double *ptr) {
    double sum = 0.;
    for (int i = 0; i < 24; i++) {
        sum += ptr[i];
    }
    return sum;
}
// Benchmark kernel: computes four overlapping 24-element sums over VALS
// (windows starting at indices 0, 1, 2 and 3), adds them left to right, and
// returns tan() of the total.
__attribute__ ((noinline))
double withtan() {
    double s0 = sum24(&VALS[0]);
    double s1 = sum24(&VALS[1]);
    double s2 = sum24(&VALS[2]);
    double s3 = sum24(&VALS[3]);
    double total = s0 + s1 + s2 + s3;
    return tan(total);
}
// Benchmark kernel: computes four overlapping 24-element sums over VALS
// (windows starting at indices 0, 1, 2 and 3) and returns their total, added
// left to right. Same summing work as withtan(), without the tan() call.
__attribute__ ((noinline))
double notan() {
    double s0 = sum24(&VALS[0]);
    double s1 = sum24(&VALS[1]);
    double s2 = sum24(&VALS[2]);
    double s3 = sum24(&VALS[3]);
    double total = s0 + s1 + s2 + s3;
    return total;
}
// Benchmark kernel: a single tan() call on the constant 96. The argument is
// routed through black_box() so the call is made with a value the compiler
// cannot fold at compile time.
__attribute__ ((noinline))
double justtan() {
    double arg = black_box(96.0);
    return tan(arg);
}
// Baseline kernel: does no work and returns the constant 1.0, so timing it
// shows the per-call overhead of the benchmark loop itself.
__attribute__ ((noinline))
double empty() {
    const double baseline = 1.0;
    return baseline;
}
// Runs each kernel through bench() in a fixed order (justtan, notan,
// withtan, empty) and prints the measured nanoseconds per call for each.
int main() {
    double ns_per_iter;

    ns_per_iter = bench(justtan);
    printf("justtan(): ~%f ns/iter\n", ns_per_iter);

    ns_per_iter = bench(notan);
    printf("notan(): ~%f ns/iter\n", ns_per_iter);

    ns_per_iter = bench(withtan);
    printf("withtan(): ~%f ns/iter\n", ns_per_iter);

    ns_per_iter = bench(empty);
    printf("empty(): ~%f ns/iter\n", ns_per_iter);
}
Why is tan slower in context than when isolated?