For the following function, clang (and sometimes gcc, in certain contexts that I cannot reproduce minimally) seems to generate bloated code when the -mavx2 switch is on.
#include <stdint.h>

unsigned count(uint64_t *f) {
    unsigned c = 0;
    for (unsigned i = 0; i < 1024; ++i) {
        if (sizeof(long) >= 8) {
            c += __builtin_popcountl(f[i]);
        } else {
            c += __builtin_popcountll(f[i]);
        }
    }
    return c;
}
This is gcc's output, and it's quite straightforward:
count:
lea rcx, [rdi+8192]
xor eax, eax
.L2:
xor edx, edx
add rdi, 8
popcnt rdx, QWORD PTR [rdi-8]
add eax, edx
cmp rcx, rdi
jne .L2
ret
However, clang decides to generate this massive bloat when -mavx2 is on (-mpopcnt was also set):
.LCPI0_0:
.zero 32,15
.LCPI0_1:
.byte 0 # 0x0
.byte 1 # 0x1
.byte 1 # 0x1
.byte 2 # 0x2
.byte 1 # 0x1
.byte 2 # 0x2
.byte 2 # 0x2
.byte 3 # 0x3
.byte 1 # 0x1
.byte 2 # 0x2
.byte 2 # 0x2
.byte 3 # 0x3
.byte 2 # 0x2
.byte 3 # 0x3
.byte 3 # 0x3
.byte 4 # 0x4
.byte 0 # 0x0
.byte 1 # 0x1
.byte 1 # 0x1
.byte 2 # 0x2
.byte 1 # 0x1
.byte 2 # 0x2
.byte 2 # 0x2
.byte 3 # 0x3
.byte 1 # 0x1
.byte 2 # 0x2
.byte 2 # 0x2
.byte 3 # 0x3
.byte 2 # 0x2
.byte 3 # 0x3
.byte 3 # 0x3
.byte 4 # 0x4
count: # @count
vpxor xmm0, xmm0, xmm0
xor eax, eax
vmovdqa ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
vmovdqa ymm2, ymmword ptr [rip + .LCPI0_1] # ymm2 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
vpxor xmm12, xmm12, xmm12
vpxor xmm4, xmm4, xmm4
vpxor xmm5, xmm5, xmm5
vpxor xmm6, xmm6, xmm6
.LBB0_1: # =>This Inner Loop Header: Depth=1
vmovdqu ymm7, ymmword ptr [rdi + 8*rax]
vmovdqu ymm8, ymmword ptr [rdi + 8*rax + 32]
vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 64]
vmovdqu ymm10, ymmword ptr [rdi + 8*rax + 96]
vpand ymm11, ymm7, ymm1
vpshufb ymm11, ymm2, ymm11
vpsrlw ymm7, ymm7, 4
vpand ymm7, ymm7, ymm1
vpshufb ymm7, ymm2, ymm7
vpaddb ymm7, ymm11, ymm7
vpsadbw ymm7, ymm12, ymm7
vpand ymm11, ymm8, ymm1
vpshufb ymm11, ymm2, ymm11
vpsrlw ymm8, ymm8, 4
vpand ymm8, ymm8, ymm1
vpshufb ymm8, ymm2, ymm8
vpaddb ymm8, ymm8, ymm11
vpsadbw ymm8, ymm8, ymm12
vpand ymm11, ymm9, ymm1
vpshufb ymm11, ymm2, ymm11
vpsrlw ymm9, ymm9, 4
vpand ymm9, ymm9, ymm1
vpshufb ymm9, ymm2, ymm9
vpaddb ymm9, ymm9, ymm11
vpsadbw ymm9, ymm9, ymm12
vpand ymm11, ymm10, ymm1
vpshufb ymm11, ymm2, ymm11
vpsrlw ymm10, ymm10, 4
vpand ymm10, ymm10, ymm1
vpshufb ymm10, ymm2, ymm10
vpaddb ymm10, ymm10, ymm11
vpsadbw ymm10, ymm10, ymm12
vextracti128 xmm3, ymm7, 1
vpackusdw xmm3, xmm7, xmm3
vpaddd xmm0, xmm0, xmm3
vextracti128 xmm3, ymm8, 1
vpackusdw xmm3, xmm8, xmm3
vpaddd xmm4, xmm4, xmm3
vextracti128 xmm3, ymm9, 1
vpackusdw xmm3, xmm9, xmm3
vpaddd xmm5, xmm5, xmm3
vextracti128 xmm3, ymm10, 1
vpackusdw xmm3, xmm10, xmm3
vpaddd xmm6, xmm6, xmm3
add rax, 16
cmp rax, 1024
jne .LBB0_1
vpaddd xmm0, xmm4, xmm0
vpaddd xmm0, xmm5, xmm0
vpaddd xmm0, xmm6, xmm0
vpshufd xmm1, xmm0, 238 # xmm1 = xmm0[2,3,2,3]
vpaddd xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 85 # xmm1 = xmm0[1,1,1,1]
vpaddd xmm0, xmm0, xmm1
vmovd eax, xmm0
vzeroupper
ret
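If I'm reading the assembly right, the two big constants are the classic in-register nibble-lookup popcount (often attributed to Wojciech Muła): .LCPI0_0 masks out 4-bit halves of each byte, .LCPI0_1 is a 16-entry table of popcounts for 0..15 indexed via vpshufb, and vpsadbw sums the per-byte counts into qword lanes. A rough intrinsics reconstruction of what each 32-byte chunk goes through (my sketch, not clang's actual source) would be:

#include <immintrin.h>

/* Sketch of the nibble-LUT popcount the loop appears to be vectorized into. */
static __m256i popcount256(__m256i v) {
    const __m256i lut = _mm256_setr_epi8(      /* .LCPI0_1: popcount of 0..15 */
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
    const __m256i mask = _mm256_set1_epi8(0x0f);               /* .LCPI0_0: 32 x 15 */
    __m256i lo  = _mm256_and_si256(v, mask);                       /* low nibble of each byte */
    __m256i hi  = _mm256_and_si256(_mm256_srli_epi16(v, 4), mask); /* high nibble */
    __m256i cnt = _mm256_add_epi8(_mm256_shuffle_epi8(lut, lo),
                                  _mm256_shuffle_epi8(lut, hi));   /* per-byte popcount */
    return _mm256_sad_epu8(cnt, _mm256_setzero_si256());          /* sum bytes into qwords */
}

So it is a legitimate vectorized popcount, just unrolled four times with separate accumulators.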
clang's code is similar to gcc's when only -mpopcnt is on, just with a bit of unrolling:
count: # @count
xor ecx, ecx
xor eax, eax
.LBB0_1: # =>This Inner Loop Header: Depth=1
popcnt rdx, qword ptr [rdi + 8*rcx]
add edx, eax
popcnt rsi, qword ptr [rdi + 8*rcx + 8]
add esi, edx
popcnt rdx, qword ptr [rdi + 8*rcx + 16]
popcnt rax, qword ptr [rdi + 8*rcx + 24]
add edx, esi
add eax, edx
add rcx, 4
cmp rcx, 1024
jne .LBB0_1
ret
According to Agner Fog's instruction tables (https://www.agner.org/optimize/instruction_tables.pdf), popcnt is a very cheap instruction on most architectures. So why does clang generate such bloat to replace popcnt when I clearly allowed its use with -mpopcnt? The optimization level was set to -O3 in all cases.
Here is a link to godbolt (https://godbolt.org/z/4vWK33a7c).
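(I know I can presumably force the scalar version with something like the loop pragma below, which I haven't verified; my question is why clang considers the vectorized form profitable, not how to avoid it.)

#include <stdint.h>

/* Hypothetical workaround, untested: disabling the loop vectorizer should
   leave the scalar popcnt lowering of __builtin_popcountll intact. */
unsigned count_scalar(uint64_t *f) {
    unsigned c = 0;
#pragma clang loop vectorize(disable)
    for (unsigned i = 0; i < 1024; ++i)
        c += __builtin_popcountll(f[i]);
    return c;
}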