As of 2023, the best way to get compilers to generate good vector code is to iterate over chunks the size of your vector registers. On AVX2 or above, those would be 256 bits, or 32 bytes.
#include <omp.h>
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#define ALIGNMENT 16U
size_t countEqualBytes(const size_t n, const uint8_t a[n], const uint8_t b[n]) {
    size_t sum = 0;
    size_t i = 0;

    if (n >= 32U) {
        const size_t sentinel = n - 31U;

        // #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i = 0; i < sentinel; i += 32U) {
            sum += (size_t)((a[i] == b[i]) +
                            (a[i + 1] == b[i + 1]) +
                            (a[i + 2] == b[i + 2]) +
                            (a[i + 3] == b[i + 3]) +
                            (a[i + 4] == b[i + 4]) +
                            (a[i + 5] == b[i + 5]) +
                            (a[i + 6] == b[i + 6]) +
                            (a[i + 7] == b[i + 7]) +
                            (a[i + 8] == b[i + 8]) +
                            (a[i + 9] == b[i + 9]) +
                            (a[i + 10] == b[i + 10]) +
                            (a[i + 11] == b[i + 11]) +
                            (a[i + 12] == b[i + 12]) +
                            (a[i + 13] == b[i + 13]) +
                            (a[i + 14] == b[i + 14]) +
                            (a[i + 15] == b[i + 15]) +
                            (a[i + 16] == b[i + 16]) +
                            (a[i + 17] == b[i + 17]) +
                            (a[i + 18] == b[i + 18]) +
                            (a[i + 19] == b[i + 19]) +
                            (a[i + 20] == b[i + 20]) +
                            (a[i + 21] == b[i + 21]) +
                            (a[i + 22] == b[i + 22]) +
                            (a[i + 23] == b[i + 23]) +
                            (a[i + 24] == b[i + 24]) +
                            (a[i + 25] == b[i + 25]) +
                            (a[i + 26] == b[i + 26]) +
                            (a[i + 27] == b[i + 27]) +
                            (a[i + 28] == b[i + 28]) +
                            (a[i + 29] == b[i + 29]) +
                            (a[i + 30] == b[i + 30]) +
                            (a[i + 31] == b[i + 31]));
        }
    }

    // Scalar tail for the last n % 32 bytes; count matches, like the main loop.
    for (; i < n; i++) {
        sum += (size_t)(a[i] == b[i]);
    }

    return sum;
}
Clang 16 or ICX 2022 with -std=c17 -O3 -march=x86-64-v4 can compile the critical loop of this to:
.LBB0_5: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [rsi + r10]
vmovdqu ymm1, ymmword ptr [rsi + r10 + 32]
vmovdqu ymm2, ymmword ptr [rsi + r10 + 64]
vmovdqu ymm3, ymmword ptr [rsi + r10 + 96]
vpcmpeqb k0, ymm0, ymmword ptr [rdx + r10]
kmovd ebx, k0
popcnt ebx, ebx
add rbx, rax
vpcmpeqb k0, ymm1, ymmword ptr [rdx + r10 + 32]
kmovd eax, k0
popcnt eax, eax
add rax, rbx
vpcmpeqb k0, ymm2, ymmword ptr [rdx + r10 + 64]
kmovd ebx, k0
popcnt ebx, ebx
add rbx, rax
vpcmpeqb k0, ymm3, ymmword ptr [rdx + r10 + 96]
kmovd eax, k0
popcnt eax, eax
add rax, rbx
sub r10, -128
add r9, -4
jne .LBB0_5
That is the following loop, unrolled four times:
.LBB0_8: # =>This Inner Loop Header: Depth=1
vmovdqu ymm0, ymmword ptr [r10 + rbx]
vpcmpeqb k0, ymm0, ymmword ptr [r9 + rbx]
kmovd ecx, k0
popcnt ecx, ecx
add rax, rcx
add rbx, 32
cmp r8, rbx
jne .LBB0_8
Although this uses an AVX512VL instruction, ICX can also vectorize for AVX or AVX2.
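To get a feel for what an AVX2 lowering of one 32-byte chunk looks like, here is a rough intrinsics sketch of my own (the helper name is mine; it assumes an AVX2 target, e.g. -mavx2, and <immintrin.h>):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch: count the equal bytes in one 32-byte chunk with AVX2. */
static inline size_t equalBytes32(const uint8_t *a, const uint8_t *b) {
    const __m256i va = _mm256_loadu_si256((const __m256i *)a);
    const __m256i vb = _mm256_loadu_si256((const __m256i *)b);
    const __m256i eq = _mm256_cmpeq_epi8(va, vb);              /* 0xFF where bytes match */
    const uint32_t mask = (uint32_t)_mm256_movemask_epi8(eq);  /* one bit per matching byte */
    return (size_t)__builtin_popcount(mask);                   /* number of matching bytes */
}

Stepping that helper over the arrays 32 bytes at a time, plus a scalar tail, is essentially what the second assembly loop above does.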
If you want to multi-thread the function as well as vectorize it, add -fiopenmp on ICX/ICPX, or -fopenmp on Clang/GCC, and uncomment the #pragma omp directive. Unfortunately, OpenMP only accepts a rigid format for the for statement, which is why the loop needs a nested if block around it rather than an extra clause in the loop condition (n > 31U && i < n - 31U).
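As a minimal sketch of that canonical loop form (the function name and the simplified inner body here are mine, purely to show the loop shape, not tuned for vectorization):

#include <omp.h>
#include <stddef.h>
#include <stdint.h>

size_t countEqualBytesOmp(const size_t n, const uint8_t a[n], const uint8_t b[n]) {
    size_t sum = 0;
    if (n >= 32U) {                      /* the guard must sit outside the for */
        const size_t sentinel = n - 31U; /* the loop test must stay a single comparison */
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (size_t i = 0; i < sentinel; i += 32U) {
            for (size_t j = 0; j < 32U; j++) {
                sum += (size_t)(a[i + j] == b[i + j]);
            }
        }
    }
    /* Scalar tail: recompute the start index instead of reusing the
       (implicitly private) parallel loop variable. */
    for (size_t i = n - n % 32U; i < n; i++) {
        sum += (size_t)(a[i] == b[i]);
    }
    return sum;
}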
Since x86 CPUs load data into registers faster when it is aligned on 16-byte boundaries, you also want to declare your input arrays alignas(ALIGNMENT).
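For example, a caller could look like this (a sketch that reuses countEqualBytes and the ALIGNMENT macro from above; the buffer size and contents are arbitrary):

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    alignas(ALIGNMENT) static uint8_t a[4096];
    alignas(ALIGNMENT) static uint8_t b[4096];

    memset(a, 0xAB, sizeof a);
    memcpy(b, a, sizeof b);
    b[100] = 0x00; /* introduce a single mismatch */

    printf("%zu\n", countEqualBytes(sizeof a, a, b)); /* prints 4095 */
    return 0;
}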
This is as fast and as portable as I was able to get it. However, you should see this answer to a very similar question by @harold, which combines a 4,096-byte outer loop with a 32-byte inner loop, then a second loop to perform a vertical add. The inner loop is also shorter by one instruction.
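For reference, the structure that answer describes looks roughly like this AVX2 sketch of mine (not @harold's exact code; it assumes n is a multiple of 4,096 to keep tail handling out of the way):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

size_t countEqualBytesBlocked(const size_t n, const uint8_t *a, const uint8_t *b) {
    __m256i total = _mm256_setzero_si256();       /* four 64-bit partial sums */
    for (size_t blk = 0; blk < n; blk += 4096U) {
        __m256i acc = _mm256_setzero_si256();     /* per-byte counters; at most 128 per lane, so no overflow */
        for (size_t i = 0; i < 4096U; i += 32U) {
            const __m256i va = _mm256_loadu_si256((const __m256i *)(a + blk + i));
            const __m256i vb = _mm256_loadu_si256((const __m256i *)(b + blk + i));
            /* cmpeq yields -1 in each equal byte; subtracting it increments that byte's counter */
            acc = _mm256_sub_epi8(acc, _mm256_cmpeq_epi8(va, vb));
        }
        /* the "vertical add": widen the byte counters to 64-bit sums and accumulate */
        total = _mm256_add_epi64(total, _mm256_sad_epu8(acc, _mm256_setzero_si256()));
    }
    uint64_t lanes[4];
    _mm256_storeu_si256((__m256i *)lanes, total);
    return (size_t)(lanes[0] + lanes[1] + lanes[2] + lanes[3]);
}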