I'm having trouble understanding a piece of assembly code. The task is to compute the dot product of two vectors using SSE arithmetic and the XMM registers. The approach is to read the vectors four floats at a time (so one XMM register holds four floats per iteration). The end result is an XMM register in which each of the four 32-bit float lanes holds a partial sum of products (x1*y1 + ...) of the given vectors.
What I don't get is the part that comes afterwards. All that is needed is to add these four partial sums together, i.e. sum the four float lanes that make up the final register. I tried to find something on this, but to no avail. The code I was given is beyond my understanding; I even tried writing out every computation on paper, but it still did not make much sense. In the highlighted part, the actual sum is computed and stored in the lowest lane of xmm0. Any insight on this is welcome.
# Target: x86-64, GNU as, Intel operand order (dst, src), no % register prefix.
# Register usage below follows the System V AMD64 calling convention.
.intel_syntax noprefix

# Declare that this object does not need an executable stack. Without this
# empty section GNU ld assumes hand-written assembly may require one.
.section .note.GNU-stack, "", @progbits

.data
# 32-bit integer constant 2.
# NOTE(review): not referenced by dot_product in the visible part of this
# file; kept in case other code uses it.
two: .int 2

.text
.global dot_product
.type dot_product, @function
############################################################################
##
## Function:
##
## void dot_product(float *x, float *y, int n, float *r);
##
## Calculates the dot product of x and y (n floats each) and stores the
## result in *r. For n <= 0 nothing is read and 0.0 is stored.
##
## ABI: System V AMD64. Leaf function, uses no stack.
##
## -- float * x -- rdi --
## -- float * y -- rsi --
## -- int n     -- edx -- (sign-extended into rdx on entry)
## -- float * r -- rcx --
##
## Clobbers: rdx, rsi, rdi, xmm0, xmm1, xmm2, flags (all caller-saved).
## x and y need no particular alignment (unaligned loads are used).
##
## No CPUID feature test is done: SSE is part of the x86-64 baseline, and
## CPUID would overwrite the callee-saved rbx.
##
############################################################################
dot_product:
        movsxd  rdx, edx                # n is a C int: only edx is defined by the ABI
        xorps   xmm0, xmm0              # xmm0 = four partial sums {0, 0, 0, 0}

# Main loop: four products per iteration, accumulated lane by lane.
# Invariant: rdx = number of elements not yet processed.
.Ldp_next_four:
        cmp     rdx, 4
        jl      .Ldp_next_one           # signed: fewer than 4 left (or n <= 0)
        movups  xmm1, [rsi]             # xmm1 = y[i..i+3]
        movups  xmm2, [rdi]             # xmm2 = x[i..i+3]
        mulps   xmm1, xmm2              # lane-wise x*y
        addps   xmm0, xmm1              # acc[k] += x[i+k] * y[i+k]
        add     rsi, 16
        add     rdi, 16
        sub     rdx, 4
        jmp     .Ldp_next_four

# Tail loop: the remaining 0..3 elements are folded into lane 0 only.
.Ldp_next_one:
        test    rdx, rdx
        jle     .Ldp_finish
        movss   xmm1, [rsi]
        movss   xmm2, [rdi]
        mulss   xmm1, xmm2
        addss   xmm0, xmm1              # acc[0] += x[i] * y[i]
        add     rsi, 4
        add     rdi, 4
        dec     rdx
        jmp     .Ldp_next_one

# Horizontal sum of the four float lanes of xmm0 = {a0, a1, a2, a3}.
.Ldp_finish:
        movhlps xmm1, xmm0              # xmm1 = {a2, a3, -, -}; upper lanes are stale and never used
        addps   xmm0, xmm1              # xmm0 = {a0+a2, a1+a3, -, -}
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0b01010101  # broadcast lane 1: xmm1[0] = a1+a3
        addss   xmm0, xmm1              # xmm0[0] = (a0+a2) + (a1+a3)
        movss   [rcx], xmm0             # *r = dot product
        ret