I have this AVX code that runs much slower than the SSE4 version and I'm trying to figure out why.
Here is the smallish loop in SSE4:
(asm by gcc 13.1)
.L6:
movaps xmm1, XMMWORD PTR [rbx+rsi]
movaps xmm3, XMMWORD PTR [rbp+0+rsi]
movaps xmm2, xmm4
lea eax, [0+rcx*4]
movd xmm0, eax
add rcx, 1
add rsi, 16
addps xmm3, xmm1
cmpleps xmm1, xmm5
pshufd xmm0, xmm0, 0
paddd xmm0, xmm6
cmpleps xmm2, xmm3
pand xmm1, xmm2
movmskps edi, xmm1
mov rax, rdi
sal rdi, 4
pshufb xmm0, XMMWORD PTR shufmasks.0[rdi]
popcnt eax, eax
movups XMMWORD PTR [r8], xmm0
lea r8, [r8+rax*4]
cmp rcx, rdx
jne .L6
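In C with intrinsics, the loop does essentially this (a simplified sketch rather than my exact source; the function name, the thresholds t1/t2, and the exact shufmasks layout are placeholders):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

#define X 0x80  /* pshufb: a byte with the high bit set writes a zero */
/* 16 left-packing byte patterns, indexed by the 4-bit movmskps result:
   row m moves the dwords whose mask bit is set to the front of the register. */
const uint8_t shufmasks[16][16] __attribute__((aligned(16))) = {
    { X,X,X,X,     X,X,X,X,       X,X,X,X,       X,X,X,X },     /* 0000 */
    { 0,1,2,3,     X,X,X,X,       X,X,X,X,       X,X,X,X },     /* 0001 */
    { 4,5,6,7,     X,X,X,X,       X,X,X,X,       X,X,X,X },     /* 0010 */
    { 0,1,2,3,     4,5,6,7,       X,X,X,X,       X,X,X,X },     /* 0011 */
    { 8,9,10,11,   X,X,X,X,       X,X,X,X,       X,X,X,X },     /* 0100 */
    { 0,1,2,3,     8,9,10,11,     X,X,X,X,       X,X,X,X },     /* 0101 */
    { 4,5,6,7,     8,9,10,11,     X,X,X,X,       X,X,X,X },     /* 0110 */
    { 0,1,2,3,     4,5,6,7,       8,9,10,11,     X,X,X,X },     /* 0111 */
    { 12,13,14,15, X,X,X,X,       X,X,X,X,       X,X,X,X },     /* 1000 */
    { 0,1,2,3,     12,13,14,15,   X,X,X,X,       X,X,X,X },     /* 1001 */
    { 4,5,6,7,     12,13,14,15,   X,X,X,X,       X,X,X,X },     /* 1010 */
    { 0,1,2,3,     4,5,6,7,       12,13,14,15,   X,X,X,X },     /* 1011 */
    { 8,9,10,11,   12,13,14,15,   X,X,X,X,       X,X,X,X },     /* 1100 */
    { 0,1,2,3,     8,9,10,11,     12,13,14,15,   X,X,X,X },     /* 1101 */
    { 4,5,6,7,     8,9,10,11,     12,13,14,15,   X,X,X,X },     /* 1110 */
    { 0,1,2,3,     4,5,6,7,       8,9,10,11,     12,13,14,15 }, /* 1111 */
};
#undef X

/* Writes the indices of all i with a[i] <= t1 && t2 <= a[i] + b[i] to out and
   returns how many were written. n is a multiple of 4, a/b are 16-byte aligned. */
size_t filter_sse(const float *a, const float *b, size_t n,
                  float t1, float t2, int32_t *out)
{
    const __m128  vt1  = _mm_set1_ps(t1);
    const __m128  vt2  = _mm_set1_ps(t2);
    const __m128i iota = _mm_setr_epi32(0, 1, 2, 3);
    int32_t *dst = out;

    for (size_t i = 0; i < n; i += 4) {
        __m128 va   = _mm_load_ps(a + i);
        __m128 sum  = _mm_add_ps(va, _mm_load_ps(b + i));
        __m128 keep = _mm_and_ps(_mm_cmple_ps(va, vt1),
                                 _mm_cmple_ps(vt2, sum));
        int m = _mm_movemask_ps(keep);

        /* element indices i..i+3, left-packed according to the mask */
        __m128i idx = _mm_add_epi32(_mm_set1_epi32((int)i), iota);
        idx = _mm_shuffle_epi8(idx, _mm_load_si128((const __m128i *)shufmasks[m]));

        _mm_storeu_si128((__m128i *)dst, idx);  /* junk lanes are overwritten later */
        dst += _mm_popcnt_u32((unsigned)m);
    }
    return (size_t)(dst - out);
}

The shufmasks table is the usual left-packing trick: the movmskps result selects a byte-shuffle pattern that compacts the surviving indices to the front of the register, and popcnt advances the output pointer by however many survived.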
AVX doesn't have 8-wide integer instructions (those only arrived with AVX2), so I use 8-wide registers to do the float math 8-wide, but for the integer work I split the register in place, do the computation as two 4-wide instructions, and stitch the halves back together (a rough intrinsics sketch of this follows the two listings below):
(asm by gcc 13.1)
.L6:
lea eax, [0+rcx*8]
vmovaps ymm7, YMMWORD PTR [rbx+rsi]
xor r10d, r10d
add rcx, 1
vmovd xmm1, eax
xor eax, eax
vpshufd xmm0, xmm1, 0
vcmpleps ymm6, ymm7, ymm3
vmovdqa xmm1, xmm0
vpaddd xmm0, xmm0, xmm5
vpaddd xmm1, xmm1, xmm2
vinsertf128 ymm0, ymm0, xmm1, 0x1
vaddps ymm1, ymm7, YMMWORD PTR [r12+rsi]
add rsi, 32
vcmpleps ymm1, ymm4, ymm1
vandps ymm1, ymm1, ymm6
vmovaps xmm6, xmm0
vextractf128 xmm0, ymm0, 0x1
vmovaps xmm7, xmm1
vextractf128 xmm1, ymm1, 0x1
vmovmskps edx, xmm7
popcnt eax, edx
sal rdx, 4
vpshufb xmm6, xmm6, XMMWORD PTR shufmasks.0[rdx]
vmovmskps edx, xmm1
popcnt r10d, edx
sal rdx, 4
vmovdqa XMMWORD PTR [rsp+32], xmm6
vpshufb xmm0, xmm0, XMMWORD PTR shufmasks.0[rdx]
movsx rdx, eax
add eax, r10d
vmovups XMMWORD PTR [rsp+32+rdx*4], xmm0
vmovdqa ymm6, YMMWORD PTR [rsp+32]
cdqe
vmovdqu YMMWORD PTR [rdi], ymm6
lea rdi, [rdi+rax*4]
cmp rcx, r8
jne .L6
(same loop, asm by gcc 12.2.0)
.L4:
vmovaps ymm11, YMMWORD PTR [rbx+rsi]
vaddps ymm12, ymm11, YMMWORD PTR [r12+rsi]
xor eax, eax
add rsi, 32
lea r11d, 0[0+rcx*8]
add rcx, 1
vcmpleps ymm14, ymm11, ymm3
vmovd xmm1, r11d
xor r11d, r11d
vcmpleps ymm13, ymm4, ymm12
vpshufd xmm0, xmm1, 0
vpaddd xmm7, xmm0, xmm5
vpaddd xmm9, xmm0, xmm8
vinsertf128 ymm10, ymm7, xmm9, 0x1
vandps ymm15, ymm13, ymm14
vextractf128 xmm0, ymm10, 0x1
vextractf128 xmm1, ymm15, 0x1
vmovmskps r13d, xmm15
vmovmskps r14d, xmm1
popcnt eax, r13d
sal r13, 4
movsx rdx, eax
popcnt r11d, r14d
sal r14, 4
vpshufb xmm7, xmm10, XMMWORD PTR [r8+r13]
add eax, r11d
vpshufb xmm9, xmm0, XMMWORD PTR [r8+r14]
vmovdqa XMMWORD PTR 32[rsp], xmm7
cdqe
vmovups XMMWORD PTR 32[rsp+rdx*4], xmm9
vmovdqa ymm10, YMMWORD PTR 32[rsp]
vmovdqu YMMWORD PTR [rdi], ymm10
lea rdi, [rdi+rax*4]
cmp rcx, r9
jne .L4
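In intrinsics, the AVX variant corresponds roughly to this (again a simplified sketch, not my exact source). It shows the 8-wide float math, the two 4-wide integer halves being stitched into and split back out of a ymm register, and the scratch buffer that shows up as the [rsp+32] traffic above:

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

extern const uint8_t shufmasks[16][16];  /* same left-pack table as in the SSE sketch */

/* Same contract as filter_sse, but n is a multiple of 8 and a/b are 32-byte aligned. */
size_t filter_avx(const float *a, const float *b, size_t n,
                  float t1, float t2, int32_t *out)
{
    const __m256  vt1     = _mm256_set1_ps(t1);
    const __m256  vt2     = _mm256_set1_ps(t2);
    const __m128i iota_lo = _mm_setr_epi32(0, 1, 2, 3);
    const __m128i iota_hi = _mm_setr_epi32(4, 5, 6, 7);
    int32_t *dst = out;

    for (size_t i = 0; i < n; i += 8) {
        __m256 va  = _mm256_load_ps(a + i);
        __m256 sum = _mm256_add_ps(va, _mm256_load_ps(b + i));

        /* the float part really is 8-wide */
        __m256 keep = _mm256_and_ps(_mm256_cmp_ps(va, vt1, _CMP_LE_OQ),
                                    _mm256_cmp_ps(vt2, sum, _CMP_LE_OQ));

        /* the integer part is two 4-wide halves stitched into one ymm... */
        __m128i base = _mm_set1_epi32((int)i);
        __m256i idx  = _mm256_insertf128_si256(
                           _mm256_castsi128_si256(_mm_add_epi32(base, iota_lo)),
                           _mm_add_epi32(base, iota_hi), 1);

        /* ...and split right back apart for the per-half pack step */
        int m_lo = _mm_movemask_ps(_mm256_castps256_ps128(keep));
        int m_hi = _mm_movemask_ps(_mm256_extractf128_ps(keep, 1));
        __m128i lo = _mm_shuffle_epi8(_mm256_castsi256_si128(idx),
                                      _mm_load_si128((const __m128i *)shufmasks[m_lo]));
        __m128i hi = _mm_shuffle_epi8(_mm256_extractf128_si256(idx, 1),
                                      _mm_load_si128((const __m128i *)shufmasks[m_hi]));

        /* glue the two packed halves together in a scratch buffer,
           then store all 32 bytes to the output in one go */
        int32_t scratch[8] __attribute__((aligned(32)));
        int n_lo = _mm_popcnt_u32((unsigned)m_lo);
        _mm_store_si128((__m128i *)scratch, lo);
        _mm_storeu_si128((__m128i *)(scratch + n_lo), hi);
        _mm256_storeu_si256((__m256i *)dst,
                            _mm256_load_si256((const __m256i *)scratch));

        dst += n_lo + _mm_popcnt_u32((unsigned)m_hi);
    }
    return (size_t)(dst - out);
}

The scratch store/reload is how the two variable-length packed halves get recombined into a single 32-byte store to the output.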
The AVX version is 2x longer, but it does 2x the work per iteration, so that should roughly even out. Yet when I measure it, the AVX version runs as slowly as the scalar version. Does anything stand out as particularly slow in the AVX loop? Is mixing 4-wide and 8-wide instructions in the same loop known to hurt performance this badly? Or is it something else? And is it something I can fix so that the AVX version at least catches up to the SSE4 version?
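For what it's worth, the measurement is a plain harness along these lines (simplified; the array size, iteration count, and thresholds are just example values, and filter_sse/filter_avx are the sketches above):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

size_t filter_sse(const float *, const float *, size_t, float, float, int32_t *);
size_t filter_avx(const float *, const float *, size_t, float, float, int32_t *);

int main(void)
{
    enum { N = 1 << 20, REPS = 200 };
    float   *a   = aligned_alloc(32, N * sizeof *a);
    float   *b   = aligned_alloc(32, N * sizeof *b);
    int32_t *out = aligned_alloc(32, N * sizeof *out);
    for (size_t i = 0; i < N; i++) {
        a[i] = (float)rand() / RAND_MAX;
        b[i] = (float)rand() / RAND_MAX;
    }

    struct timespec t0, t1;
    size_t kept = 0;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int r = 0; r < REPS; r++)
        kept += filter_avx(a, b, N, 0.5f, 0.75f, out);  /* or filter_sse */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (double)(t1.tv_sec - t0.tv_sec) + 1e-9 * (t1.tv_nsec - t0.tv_nsec);
    printf("kept %zu total, %.3f ms per pass\n", kept, 1e3 * secs / REPS);
    free(a); free(b); free(out);
    return 0;
}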