I did this expriment https://en.algorithmica.org/hpc/pipelining/branching/
which is inspired by the most upvoted Stack Overflow question ever.
Why is processing a sorted array faster than processing an unsorted array?
however i got a symmetrical result on Apple M1
chip:
(while the experiment use AMD Zen2)
the code is
#include <bits/stdc++.h>
#ifndef N
#define N 1'000'000
#endif
#ifndef T
#define T 1e8
#endif
#ifndef P
#define P 50
#endif
const int K = T / N;
int a[N];
int main()
{
for (int i = 0; i < N; i++)
a[i] = rand() % 100;
#ifdef SORT
std::sort(a, a + N);
#endif
clock_t start = clock();
volatile int s = 0;
for (int k = 0; k < K; k++)
for (int i = 0; i < N; i++)
#ifdef CMOV
s += (a[i] < P ? a[i] : 0);
#else
// if (__builtin_expect(a[i] < P, false))// [[unlikely]]
if (a[i] < P)
s += a[i];
#endif
float seconds = float(clock() - start) / CLOCKS_PER_SEC;
float per_element = 1e9 * seconds / K / N;
printf("%.4f %.4f %.4f\n", seconds, per_element, 2 * per_element);
printf("%d\n", s);
return 0;
}
and the plot code with ipython:
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
sns.reset_defaults()
sns.set_theme(style='whitegrid')
# benchmark
def bench(n=10**6, p=50, t=10**8, sort=False, cmov=False, unroll=False, cc='clang++'):
res = !{cc} -std=c++17 -O3 -D N={n} -D T={t} -D P={p} {"-D CMOV" if cmov else ""} {"-D SORT" if sort else ""} {"-funroll-loops" if unroll else ""} branching.cc -o run && ./run
print(res)
return float(res[0].split()[-1])
ps = list(range(0, 101))
rs = [bench(p=p) for p in ps]
# plot
with open('ps.pkl', 'wb') as file:
pickle.dump(rs, file)
with open('ps.pkl', 'rb') as file:
rs = pickle.load(file)
plt.plot(ps, rs, c='darkred')
plt.xlabel('Branch probability (P)')
plt.ylabel('Cycles per iteration')
plt.title('for (int i = 0; i < N; i++) if (a[i] < P) s += a[i]', pad=12)
plt.ylim(bottom=0)
plt.margins(0)
fig = plt.gcf()
fig.savefig('branchy-vs-branchless.svg')
plt.show()
if the condition is true ,there would be more instructions for add to s
so there should have a asymmetrical result.
why?
i tried to change !{cc} -std=c++17 -O3
to -O2
-O1
and -O0
but the result is still symmetrical
Add branchless experiment results: