This is my assembly source code for strlen using AVX512BW:
strlen_avx512:
    mov rax, rdi
    test al, 63                     ; is the pointer 64-byte aligned?
    jz .aligned_str
    vmovdqu64 zmm0, zword [rax]     ; unaligned load of the first 64 bytes
    vptestnmb k0, zmm0, zmm0        ; set a mask bit for every zero byte
    kortestq k0, k0
    jz .do_align_64                 ; no terminator in the head, go align
    kmovq rcx, k0
    tzcnt rax, rcx                  ; index of the first zero byte = length
    vzeroupper
    ret
.do_align_64:
    add rax, 63                     ; round up to the next 64-byte boundary
    and rax, -64
.aligned_str:
    vmovdqa64 zmm0, zword [rax]     ; four aligned 64-byte loads per iteration
    vmovdqa64 zmm1, zword [rax+64]
    vmovdqa64 zmm2, zword [rax+128]
    vmovdqa64 zmm3, zword [rax+192]
    vpminub zmm4, zmm0, zmm1        ; unsigned byte min is 0 iff either input byte is 0
    vpminub zmm5, zmm2, zmm3
    vpminub zmm5, zmm5, zmm4
    vptestnmb k0, zmm5, zmm5        ; any 0x00 in this 256-byte block?
    kortestq k0, k0
    jnz .done
    add rax, 256
    jmp .aligned_str
.done:
    sub rax, rdi                    ; offset of the current block from the start
    vptestnmb k0, zmm0, zmm0        ; find which vector holds the terminator
    kortestq k0, k0
    jnz .end
    vptestnmb k0, zmm1, zmm1
    kortestq k0, k0
    jnz .end1
    vptestnmb k0, zmm2, zmm2
    kortestq k0, k0
    jnz .end2
    vptestnmb k0, zmm3, zmm3        ; not in zmm0-zmm2, so it must be in zmm3
    add rax, 192
.end:
    kmovq rcx, k0
    tzcnt rcx, rcx                  ; position of the zero byte within its vector
    add rax, rcx
    vzeroupper
    ret
.end1:
    add rax, 64
    jmp .end
.end2:
    add rax, 128
    jmp .end
This function works correctly, but it does not give me the speed I expected. I wrote an AVX2 version of the same algorithm (also built around vpminub) with ymm registers, and its speed was amazing: calling it 1,000,000 times took about 4 s. Calling this AVX-512 version 1,000,000 times takes about 3 s (2.9 s). I expected something like 2 s, but it is only about 1.4x faster, not 2x faster.
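
Here is a simplified version of how I timed it (illustrative only: the buffer size, alignment, and timer here are just an example setup, not my exact benchmark):

/* call the function 1,000,000 times on one long string and time it */
#include <stdio.h>
#include <time.h>

size_t strlen_avx512(const char *s);    /* the assembly function above */

int main(void)
{
    static char buf[4096] __attribute__((aligned(64)));
    for (int i = 0; i < 4095; i++)
        buf[i] = 'a';                   /* 4095 bytes + terminating 0x00 */

    volatile size_t len = 0;            /* keep the calls from being optimized away */
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < 1000000; i++)
        len = strlen_avx512(buf);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("len=%zu time=%.3f s\n", (size_t)len, sec);
    return 0;
}
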
1 - I think this function needs some optimization to speed it up. Is there anything else I can do to make it faster?
2 - Another question: why vzeroupper? I generated some AVX-512 code with gcc's '-march=skylake-avx512' flag, and gcc puts vzeroupper in the generated code, so I added it to my source too. But why is it needed?
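
For example, when I compile a trivial loop like this with 'gcc -O3 -march=skylake-avx512 -S' (the exact source does not matter, any vectorizable loop shows it), the vectorized code ends with vzeroupper right before the ret:

/* add_one.c -- gcc vectorizes this loop with ymm/zmm registers
   and emits vzeroupper before returning */
void add_one(float *a, int n)
{
    for (int i = 0; i < n; i++)
        a[i] += 1.0f;
}
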
3 - One more question about this function: I saw some strlen implementations that do a 'cross_page check'. Is there anything else I have to check in this function (page-boundary checks or similar)?
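
As far as I understand it (this is my reading of the code I saw, not necessarily the exact check those functions use), the idea is to test, before an unaligned load like my first vmovdqu64, whether a 64-byte load would cross into the next 4 KiB page, which might be unmapped:

/* does a 64-byte load starting at p cross a 4 KiB page boundary? */
#include <stdint.h>

static int load64_would_cross_page(const char *p)
{
    return ((uintptr_t)p & 4095) > 4096 - 64;
}
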