Questions
- What is the purpose or intention of a MoveMask?
- What's the best place to learn how to use x86/x86-64 assembly/SSE/AVX?
- Could I have written my code more efficiently?
Reason for Questions
I have a function written in F# for .NET that uses SSE2. I've written the same thing using AVX2, but the underlying question is the same: what is the intended purpose of a MoveMask
? I know that it works for my purposes; I want to know why.
I am iterating through two 64-bit float arrays, a
and b
, testing that all of their values match. I am using the CompareEqual
method (which I believe is wrapping a call to __m128d _mm_cmpeq_pd
) to compare several values at a time. I then compare that result with a Vector128
of 0.0
64-bit float. My reasoning is that the result of CompareEqual
will give a 0.0
value in the cases where the values don't match. Up to this point, it makes sense.
I then use the Sse2.MoveMask
method on the result of the comparison with the zero vector. I've previously worked on using SSE
and AVX
for matching and I saw examples of people using MoveMask
for the purpose of testing for non-zero values. I believe this method is using the int _mm_movemask_epi8
Intel intrinsic. I have included the F# code and the assembly that is JITed.
Is this really the intention of a MoveMask
or is it just a happy coincidence that it works for these purposes? I know my code works; I want to know WHY it works.
F# Code
#nowarn "9" "51" "20" // Don't want warnings about pointers
open System
open FSharp.NativeInterop
open System.Runtime.Intrinsics.X86
open System.Runtime.Intrinsics
open System.Collections.Generic
/// Returns true when `a` and `b` have the same length and every pair of
/// corresponding elements compares equal. Equality is IEEE-754 equality in
/// both the vector path and the scalar tail, so a NaN never equals anything
/// (including another NaN).
let sseFloatEquals (a: array<float>) (b: array<float>) =
    if a.Length = b.Length then
        let mutable result = true
        let mutable idx = 0

        // Only take the vector path when the hardware supports SSE2; otherwise
        // the scalar loop below handles the whole array.
        if Sse2.IsSupported && a.Length > 3 then
            // Number of doubles processed per iteration (2 for a 128-bit vector).
            // The loop stride MUST be this same value, otherwise elements are skipped.
            let lanes = Vector128<float>.Count
            let lastBlockIdx = a.Length - (a.Length % lanes)
            // Pin both arrays so the GC cannot relocate them while raw
            // pointers into their storage are being dereferenced.
            use aPointer = fixed a
            use bPointer = fixed b
            let zeroVector = Vector128.Create 0.0

            while idx < lastBlockIdx && result do
                let aVector = Sse2.LoadVector128 (NativePtr.add aPointer idx)
                let bVector = Sse2.LoadVector128 (NativePtr.add bPointer idx)
                // Each lane is all-ones where a = b and all-zeros where a <> b.
                let comparison = Sse2.CompareEqual (aVector, bVector)
                // Comparing against 0.0 inverts that: a lane becomes all-ones
                // exactly where `comparison` was all-zeros, i.e. on a mismatch.
                let zeroTest = Sse2.CompareEqual (comparison, zeroVector)
                // The line I want to understand
                // MoveMask packs the top bit of every byte into an int, so the
                // result is non-zero iff at least one lane flagged a mismatch.
                let matches = Sse2.MoveMask (zeroTest.AsByte ())
                if matches <> 0 then
                    result <- false

                idx <- idx + lanes

        // Scalar tail: the leftover element (or the whole array when the
        // vector path was not taken). Lengths are already known to be equal.
        while idx < a.Length && result do
            if a.[idx] <> b.[idx] then
                result <- false

            idx <- idx + 1

        result

    else
        false
Emitted Assembly
; Core CLR 5.0.921.35908 on amd64
; Body of the comparison. It is reached from the sseFloatEquals stub after
; the two array lengths were found equal. rcx = a, rdx = b; the array length
; is read from [reg+8] and the element data starts at [reg+0x10].
; Register roles in this listing: eax = result, r8d = idx, r9d = a.Length,
; r11d = lastBlockIdx, rbp = pointer into a, r10 = pointer into b, xmm0 = zero vector.
_.sseFloatEquals$cont@11(System.Double[], System.Double[], Microsoft.FSharp.Core.Unit)
; Prologue: save the registers used below and reserve 0x28 bytes of stack.
L0000: push rdi
L0001: push rsi
L0002: push rbp
L0003: push rbx
L0004: sub rsp, 0x28
L0008: vzeroupper
L000b: mov eax, 1 ; result = true
L0010: xor r8d, r8d ; idx = 0
L0013: mov r9d, [rcx+8] ; r9d = a.Length
L0017: cmp r9d, 3
L001b: jle short L008e ; a.Length <= 3: skip the vector path entirely
; lastBlockIdx = a.Length - (a.Length & 1), i.e. a.Length rounded down to a multiple of 2.
L001d: mov r10d, r9d
L0020: and r10d, 1
L0024: mov r11d, r9d
L0027: sub r11d, r10d
L002a: lea r10, [rcx+0x10] ; address of a's first element
L002e: mov esi, r9d ; length kept alongside that address (span over a)
; Span over b: a null b yields pointer 0 / length 0, otherwise &b[0] / b.Length.
L0031: test rdx, rdx
L0034: jne short L003c
L0036: xor edi, edi
L0038: xor ebx, ebx
L003a: jmp short L0043
L003c: lea rdi, [rdx+0x10]
L0040: mov ebx, [rdx+8]
; aPointer (rbp): 0 when the span over a has length 0, otherwise its data address.
L0043: xor ebp, ebp
L0045: test esi, esi
L0047: je short L004c
L0049: mov rbp, r10
; bPointer (r10): 0 when the span over b has length 0, otherwise its data address.
L004c: xor r10d, r10d
L004f: test ebx, ebx
L0051: je short L0056
L0053: mov r10, rdi
L0056: vxorps xmm0, xmm0, xmm0 ; zeroVector = all zero bits
; Vector loop head: continue while idx < lastBlockIdx and result is still true.
L005a: cmp r8d, r11d
L005d: jge short L008e
L005f: mov esi, eax
L0061: test esi, esi
L0063: je short L008e
L0065: movsxd rsi, r8d ; sign-extend idx for use as a 64-bit index
L0068: vmovupd xmm1, [rbp+rsi*8] ; load 16 bytes (2 doubles) from a at idx
L006e: vmovupd xmm2, [r10+rsi*8] ; load 16 bytes (2 doubles) from b at idx
L0074: vcmpeqpd xmm1, xmm1, xmm2 ; comparison: each lane all-ones if equal, else all-zeros
L0079: vcmpeqpd xmm1, xmm1, xmm0 ; zeroTest: each lane all-ones where comparison was zero
L007e: vpmovmskb esi, xmm1 ; matches = top bit of each of the 16 bytes
L0082: test esi, esi
L0084: je short L0088 ; mask is zero: no mismatch flagged in this block
L0086: xor eax, eax ; result = false
; NOTE(review): idx advances by 4 although each iteration loads only 2 doubles,
; so elements idx+2 and idx+3 are not compared by this loop.
L0088: add r8d, 4
L008c: jmp short L005a
; Scalar loop head: continue while idx < a.Length, idx < b.Length and result is true.
L008e: cmp r9d, r8d
L0091: jle short L00c8
L0093: cmp [rdx+8], r8d
L0097: jle short L00c8
L0099: mov r10d, eax
L009c: test r10d, r10d
L009f: je short L00c8
L00a1: cmp r8d, r9d ; unsigned bounds check of idx against a.Length
L00a4: jae short L00d1
L00a6: movsxd r10, r8d
L00a9: vmovsd xmm0, [rcx+r10*8+0x10] ; xmm0 = a[idx]
L00b0: cmp r8d, [rdx+8] ; unsigned bounds check of idx against b.Length
L00b4: jae short L00d1
L00b6: vucomisd xmm0, [rdx+r10*8+0x10] ; compare a[idx] with b[idx]
L00bd: jp short L00c1 ; parity set = unordered (a NaN involved): treat as not equal
L00bf: je short L00c3 ; equal: leave result unchanged
L00c1: xor eax, eax ; result = false
L00c3: inc r8d ; idx = idx + 1
L00c6: jmp short L008e
; Epilogue: release the stack area, restore saved registers, return result in eax.
L00c8: add rsp, 0x28
L00cc: pop rbx
L00cd: pop rbp
L00ce: pop rsi
L00cf: pop rdi
L00d0: ret
; Target of both failed bounds checks; presumably the runtime's
; index-out-of-range throw helper (address only, not resolvable from this listing).
L00d1: call 0x00007ffcef38a370
L00d6: int3
; Entry stub: checks that both arrays have the same length, then jumps to the
; code that does the element comparison; otherwise returns false.
_.sseFloatEquals(System.Double[], System.Double[])
L0000: mov r8d, [rcx+8] ; r8d = a.Length
L0004: cmp r8d, [rdx+8] ; compare with b.Length
L0008: jne short L0012 ; lengths differ: return false
L000a: xor r8d, r8d ; zero the third argument register before the jump
; Tail jump; presumably to sseFloatEquals$cont@11 (only the raw address is shown).
L000d: jmp 0x00007ffc99000480
L0012: xor eax, eax ; result = false
L0014: ret