
Hi, I have the following code:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Numerics;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;

public unsafe class MultiplyAndAdd : IDisposable
{
    float[] rawFirstData = new float[1024];
    float[] rawSecondData = new float[1024];

    static int alignment = 32;
    float[] alignedFirstData = new float[1024 + alignment / sizeof(float)];
    int alignedFirstDataOffset;
    GCHandle alignedFirstDataHandle;
    float* alignedFirstDataPointer;
    float[] alignedSecondData = new float[1024 + alignment / sizeof(float)];
    int alignedSecondDataOffset;
    GCHandle alignedSecondDataHandle;
    float* alignedSecondDataPointer;

    public IEnumerable<object[]> Data { get; set; }

    public void Dispose()
    {
        this.alignedFirstDataHandle.Free();
        this.alignedSecondDataHandle.Free();
    }

    //Calculate the offset that needs to be applied so that the array data is aligned to 32 bytes.
    private int CalculateAlignmentOffset(GCHandle handle)
    {
        var handlePointer = handle.AddrOfPinnedObject().ToInt64();
        long lPtr2 = (handlePointer + alignment - 1) & ~(alignment - 1);
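        // Example (not in the original post): with alignment = 32, a pinned base address ending in
        // ...0x08 rounds up to ...0x20, so this method returns 0x18 = 24 bytes = 6 floats of padding.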
        
        return (int)(lPtr2 - handlePointer);
    }

    public MultiplyAndAdd()
    {
        Random random = new Random(1055);
        for (var i = 0; i < 1024; i++)
        {
            rawFirstData[i] = (float)random.NextDouble() * 4f - 2f;
            rawSecondData[i] = (float)random.NextDouble() * 4f - 2f;
        }

        alignedFirstDataHandle = GCHandle.Alloc(alignedFirstData, GCHandleType.Pinned);
        alignedFirstDataOffset = CalculateAlignmentOffset(alignedFirstDataHandle);
        alignedFirstDataPointer = (float*)(alignedFirstDataHandle.AddrOfPinnedObject() + alignedFirstDataOffset);

        alignedSecondDataHandle = GCHandle.Alloc(alignedSecondData, GCHandleType.Pinned);
        alignedSecondDataOffset = CalculateAlignmentOffset(alignedSecondDataHandle);
        alignedSecondDataPointer = (float*)(alignedSecondDataHandle.AddrOfPinnedObject() + alignedSecondDataOffset);

        for (var i = 0; i < 1024; i++)
        {
            alignedFirstData[i + alignedFirstDataOffset / sizeof(float)] = rawFirstData[i];
            alignedSecondData[i + alignedSecondDataOffset / sizeof(float)] = rawSecondData[i];
        }

        Data = new[] { 
            //7, 
            8, 
            //11, 
            //16, 
            20, 
            //30, 
            32, 
            //40, 
            50 }.Select(x => new object[] { x }).ToList();
    }

    public void Validate()
    {
        for(var i = 0; i < 1024; i++)
        {
            if (rawFirstData[i] != alignedFirstData[i + alignedFirstDataOffset / sizeof(float)])
            {
                throw new InvalidOperationException("Diff found!");
            }
            if (rawFirstData[i] != *(alignedFirstDataPointer + i))
            {
                throw new InvalidOperationException("Diff found!");
            }

            if (rawSecondData[i] != alignedSecondData[i + alignedSecondDataOffset / sizeof(float)])
            {
                throw new InvalidOperationException("Diff found!");
            }
            if (rawSecondData[i] != *(alignedSecondDataPointer + i))
            {
                throw new InvalidOperationException("Diff found!");
            }
        }

        Action<string, float, float> ensureAlmostSame = delegate (string name, float normal, float other)
        {
            var diff = MathF.Abs(normal - other);
            if (diff > 0.00001)
            {
                throw new InvalidOperationException($"The difference between normal and {name} was {diff}");
            }
        };
        foreach (var count in Data.Select(x => (int)x[0]))
        {
            var normal = Normal(count);
            var vectorUnaligned = VectorUnaligned(count);
            ensureAlmostSame(nameof(vectorUnaligned), normal, vectorUnaligned);
            var vectorAligned = VectorAligned(count);
            ensureAlmostSame(nameof(vectorAligned), normal, vectorAligned);
            var avx2Aligned = Avx2Aligned(count);
            ensureAlmostSame(nameof(avx2Aligned), normal, avx2Aligned);
            var fmaAligned = FmaAligned(count);
            ensureAlmostSame(nameof(fmaAligned), normal, fmaAligned);
        }
    }

    //[Benchmark(Baseline = true)]
    [ArgumentsSource(nameof(Data))]
    public float Normal(int count)
    {
        var result = 0f;
        for (var i = 0; i < count; i++)
        {
            result += rawFirstData[i] * rawSecondData[i];
        }
        return result;
    }

    [Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float VectorUnaligned(int count)
    {
        int vectorSize = Vector<float>.Count;
        var accVector = Vector<float>.Zero;
        int i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = new Vector<float>(rawFirstData, i);
            var secondVector = new Vector<float>(rawSecondData, i);
            var v = Vector.Multiply(firstVector, secondVector);

            accVector = Vector.Add(v, accVector);
        }
        float result = Vector.Sum(accVector);
        for (; i < count; i++)
        {
            result += rawFirstData[i] * rawSecondData[i];
        }
        return result;
    }

    //[Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float VectorAligned(int count)
    {
        int vectorSize = Vector<float>.Count;
        var accVector = Vector<float>.Zero;
        int i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = new Vector<float>(alignedFirstData, alignedFirstDataOffset / sizeof(float) + i);
            var secondVector = new Vector<float>(alignedSecondData, alignedSecondDataOffset / sizeof(float) + i);
            var v = Vector.Multiply(firstVector, secondVector);

            accVector = Vector.Add(v, accVector);
        }
        float result = Vector.Sum(accVector);
        for (; i < count; i++)
        {
            result += rawFirstData[i] * rawSecondData[i];
        }
        return result;
    }

    [Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float Avx2Aligned(int count)
    {
        int vectorSize = Vector256<float>.Count;
        var accumulationVector = Vector256<float>.Zero;
        var i = 0;
        for (;i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = Avx2.LoadAlignedVector256(alignedFirstDataPointer + i);
            var secondVector = Avx2.LoadAlignedVector256(alignedSecondDataPointer + i);
            var resultVector = Avx2.Multiply(firstVector, secondVector);
            accumulationVector = Avx2.Add(accumulationVector, resultVector);
        }
        var result = 0f;
        var temp = stackalloc float[vectorSize];
        Avx2.Store(temp, accumulationVector);
        for (int j = 0; j < vectorSize; j++)
        {
            result += temp[j];
        }
        for (; i < count; i++)
        {
            result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
        }
        return result;
    }

    [Benchmark]
    [ArgumentsSource(nameof(Data))]
    public float FmaAligned(int count)
    {
        int vectorSize = Vector256<float>.Count;
        var accumulationVector = Vector256<float>.Zero;
        var i = 0;
        for (; i <= count - vectorSize; i += vectorSize)
        {
            var firstVector = Avx2.LoadAlignedVector256(alignedFirstDataPointer + i);
            var secondVector = Avx2.LoadAlignedVector256(alignedSecondDataPointer + i);
            accumulationVector = Fma.MultiplyAdd(firstVector, secondVector, accumulationVector);
        }
        var result = 0f;
        var temp = stackalloc float[vectorSize];
        Avx2.Store(temp, accumulationVector);
        for (int j = 0; j < vectorSize; j++)
        {
            result += temp[j];
        }
        for (; i < count; i++)
        {
            result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
        }
        return result;
    }
}

If I run this benchmark on my Zen3 CPU, I get the following result:

BenchmarkDotNet=v0.13.1, OS=Windows 10.0.19042.1586 (20H2/October2020Update)
AMD Ryzen 5 5600X, 1 CPU, 12 logical and 6 physical cores
.NET SDK=6.0.200
  [Host]     : .NET 6.0.2 (6.0.222.6406), X64 RyuJIT
  DefaultJob : .NET 6.0.2 (6.0.222.6406), X64 RyuJIT


|          Method | count |     Mean |     Error |    StdDev |
|---------------- |------ |---------:|----------:|----------:|
| VectorUnaligned |     8 | 1.231 ns | 0.0093 ns | 0.0082 ns |
|     Avx2Aligned |     8 | 3.576 ns | 0.0208 ns | 0.0195 ns |
|      FmaAligned |     8 | 3.408 ns | 0.0259 ns | 0.0243 ns |
| VectorUnaligned |    20 | 4.428 ns | 0.0146 ns | 0.0122 ns |
|     Avx2Aligned |    20 | 6.321 ns | 0.0578 ns | 0.0541 ns |
|      FmaAligned |    20 | 5.845 ns | 0.0121 ns | 0.0113 ns |
| VectorUnaligned |    32 | 4.022 ns | 0.0098 ns | 0.0087 ns |
|     Avx2Aligned |    32 | 5.205 ns | 0.0161 ns | 0.0150 ns |
|      FmaAligned |    32 | 4.776 ns | 0.0265 ns | 0.0221 ns |
| VectorUnaligned |    50 | 6.901 ns | 0.0337 ns | 0.0315 ns |
|     Avx2Aligned |    50 | 7.207 ns | 0.0476 ns | 0.0422 ns |
|      FmaAligned |    50 | 7.246 ns | 0.0169 ns | 0.0158 ns |

Why is VectorUnaligned so much faster than the more optimized AVX2 and Fma code?

If I enable VectorAligned, it's also slower than VectorUnaligned.

  • You're only using a single accumulator, and `vaddpd` has lower latency (3c) on Zen3 than `fmadd...pd` (4c), and you're not unrolling to hide FP latency. But that doesn't explain the unaligned case. Perhaps because your arrays are so tiny, and your horizontal sum so naive (just looping in order, not shuffling in halves), that 128-bit vectors win? Or maybe something about the complicated way you over-allocate and then take pointers is defeating some optimization there vs. using `rawFirstData`? – Peter Cordes Mar 26 '22 at 20:09
  • Re: unrolling with multiple accumulators to hide FMA latency on larger arrays: [Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? (Unrolling FP loops with multiple accumulators)](https://stackoverflow.com/q/45113527). Re: hsum at the end by shuffling and adding: [Fastest way to do horizontal SSE vector sum (or other reduction)](https://stackoverflow.com/a/35270026) . – Peter Cordes Mar 26 '22 at 20:16
  • Re: Zen FMA vs. mul/add throughput and latency: [GEMM kernel implemented using AVX2 is faster than AVX2/FMA on a Zen 2 CPU](https://stackoverflow.com/q/70340734) - it's not like Intel; separate mul/add just cost front-end throughput, but the execution units are on different ports so it can sustain 2 mul and 2 add per clock (for SIMD vectors of 2 or 4 doubles each). (When not bound on latency like you are here! You bottleneck purely on add or FMA latency, not throughput. Except your arrays are tiny so OoO exec can overlap work, maybe even across invocations by the benchmark framework?) – Peter Cordes Mar 26 '22 at 20:17
  • @PeterCordes I haven't had time to try unrolling with multiple accumulation variables yet, but the `Vector` code is 256-bit just as the hand-coded AVX2 is. `Vector` is just a "safe" class one can use in C# to avoid unsafe code, so I don't understand why it's faster... – Peter Mar 27 '22 at 16:53
  • Oh right, C# SIMD Vector picks a size depending on the target machine, so it should be picking 256 here. IDK, you could try looking at the resulting asm, maybe on https://sharplab.io/ to see if there are any differences, or maybe it's something about the data. Note that 256-bit AVX FP math operations only require AVX1, not AVX2, so it's weird to me that `Avx2.Multiply` is an FP multiply. Maybe that's normal? You are checking the results so it's not actually doing integer `vpmulld` on the bit-patterns or anything. Whatever's going on here, it's likely a C# thing, not a CPU/asm thing. – Peter Cordes Mar 27 '22 at 19:24
  • You're right, the multiply operation is defined on Avx, not Avx2; it's just that Avx2 inherits from Avx, so I can find the function there. I'm 99% sure it's something about C# that I don't understand, I'm just so confused: `VectorUnaligned` should be about the same thing as `Avx2Aligned` minus the alignment. – Peter Mar 27 '22 at 20:18
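For reference, here is a minimal sketch of the unrolling idea from the comments above: several independent accumulators so the FMA dependency chains can overlap, plus a shuffle-based horizontal sum instead of storing to the stack and summing in a loop. The method name, the choice of four accumulators, and the SSE reduction sequence are illustrative and not from the original post; the code assumes the fields of the MultiplyAndAdd class above. With the tiny counts benchmarked here (8 to 50) the unrolled loop barely runs, so this mainly matters for larger arrays.

public float FmaAlignedUnrolled(int count)
{
    int vectorSize = Vector256<float>.Count;
    // Four independent accumulators give four independent FMA dependency chains.
    var acc0 = Vector256<float>.Zero;
    var acc1 = Vector256<float>.Zero;
    var acc2 = Vector256<float>.Zero;
    var acc3 = Vector256<float>.Zero;
    var i = 0;
    for (; i <= count - 4 * vectorSize; i += 4 * vectorSize)
    {
        acc0 = Fma.MultiplyAdd(Avx.LoadAlignedVector256(alignedFirstDataPointer + i),
                               Avx.LoadAlignedVector256(alignedSecondDataPointer + i), acc0);
        acc1 = Fma.MultiplyAdd(Avx.LoadAlignedVector256(alignedFirstDataPointer + i + vectorSize),
                               Avx.LoadAlignedVector256(alignedSecondDataPointer + i + vectorSize), acc1);
        acc2 = Fma.MultiplyAdd(Avx.LoadAlignedVector256(alignedFirstDataPointer + i + 2 * vectorSize),
                               Avx.LoadAlignedVector256(alignedSecondDataPointer + i + 2 * vectorSize), acc2);
        acc3 = Fma.MultiplyAdd(Avx.LoadAlignedVector256(alignedFirstDataPointer + i + 3 * vectorSize),
                               Avx.LoadAlignedVector256(alignedSecondDataPointer + i + 3 * vectorSize), acc3);
    }
    // Remaining full vectors with a single accumulator.
    for (; i <= count - vectorSize; i += vectorSize)
    {
        acc0 = Fma.MultiplyAdd(Avx.LoadAlignedVector256(alignedFirstDataPointer + i),
                               Avx.LoadAlignedVector256(alignedSecondDataPointer + i), acc0);
    }
    // Pairwise reduction of the accumulators, then a shuffle-based horizontal sum.
    var acc = Avx.Add(Avx.Add(acc0, acc1), Avx.Add(acc2, acc3));
    var sum128 = Sse.Add(acc.GetLower(), acc.GetUpper());                    // 8 lanes -> 4 lanes
    sum128 = Sse.Add(sum128, Sse.MoveHighToLow(sum128, sum128));             // 4 lanes -> 2 lanes
    sum128 = Sse.AddScalar(sum128, Sse.Shuffle(sum128, sum128, 0x55));       // 2 lanes -> 1 lane
    float result = sum128.ToScalar();
    // Scalar tail for the last count % vectorSize elements.
    for (; i < count; i++)
    {
        result += *(alignedFirstDataPointer + i) * *(alignedSecondDataPointer + i);
    }
    return result;
}

Four accumulators is a compromise; fully hiding the 4-cycle FMA latency at 2 FMAs per clock on Zen3 would take about eight.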

1 Answer


Not an answer but a tip for "Fastest way to multiply".

Sorry, I don't know how to deal with alignment, but you missed the option of casting the arrays to spans of vectors. It might be faster than picking individual floats out of the source arrays in the loop.

int vectorSize = Vector<float>.Count;
var accVector = Vector<float>.Zero;

// Reinterpret the first `count` floats as whole Vector<float> elements; any remainder
// that doesn't fill a full vector is handled by the scalar tail loop below.
Span<Vector<float>> firstVectors = MemoryMarshal.Cast<float, Vector<float>>(rawFirstData.AsSpan(0, count));
Span<Vector<float>> secondVectors = MemoryMarshal.Cast<float, Vector<float>>(rawSecondData.AsSpan(0, count));

for (int i = 0; i < firstVectors.Length; i++)
{
    accVector += Vector.Multiply(firstVectors[i], secondVectors[i]);
}

// Horizontal sum of the accumulator, then a scalar tail for the remaining elements.
float result = Vector.Sum(accVector);
for (int i = firstVectors.Length * vectorSize; i < count; i++)
{
    result += rawFirstData[i] * rawSecondData[i];
}

It produces a bit more JIT assembly code than the VectorUnaligned method, but the first loop is roughly half as long because it contains only one out-of-range check instead of four. Give it a chance and test it with different types of vectors and alignment.
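If you also want to try the aligned buffers with this approach, one possibility (a sketch only, not from the original answer; it assumes the unsafe context and the aligned pointer fields from the question's class) is to build the spans straight from the aligned pointers:

Span<float> firstAligned = new Span<float>(alignedFirstDataPointer, count);
Span<float> secondAligned = new Span<float>(alignedSecondDataPointer, count);
// MemoryMarshal.Cast keeps only whole vectors; the remainder goes through the scalar tail as before.
Span<Vector<float>> firstVectors = MemoryMarshal.Cast<float, Vector<float>>(firstAligned);
Span<Vector<float>> secondVectors = MemoryMarshal.Cast<float, Vector<float>>(secondAligned);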

The span-cast loop compiles to this:

L0080: movsxd rsi, r11d
L0083: shl rsi, 5
L0087: vmovupd ymm1, [r8+rsi]
L008d: cmp r11d, r9d
L0090: jae short L00ff ; throw out-of-range
L0092: vmovupd ymm2, [r10+rsi]
L0098: vmulps ymm1, ymm1, ymm2
L009c: vaddps ymm0, ymm0, ymm1
L00a0: inc r11d
L00a3: cmp r11d, edx
L00a6: jl short L0080

The VectorUnaligned loop; it looks like the JIT failed to optimize away the repeated bounds checks:

L0020: mov r8, rdx
L0023: cmp eax, [r8+8]
L0027: jae L00c3 ; throw out-of-range
L002d: lea r9d, [rax+7]
L0031: cmp r9d, [r8+8]
L0035: jae L00c3 ; throw out-of-range
L003b: vmovupd ymm1, [r8+rax*4+0x10]
L0042: mov r8, [rcx+0x10]
L0046: cmp eax, [r8+8]
L004a: jae L00c3 ; throw out-of-range
L0050: cmp r9d, [r8+8]
L0054: jae short L00c3 ; throw out-of-range
L0056: vmovupd ymm2, [r8+rax*4+0x10]
L005d: vmulps ymm1, ymm1, ymm2
L0061: vaddps ymm0, ymm1, ymm0
L0065: add eax, 8
L0068: mov r8d, [rdx+8]
L006c: sub r8d, 8
L0070: cmp r8d, eax
L0073: jge short L0020

The compiled code was obtained from https://sharplab.io/. The actual generated code may vary from CPU to CPU because Vector<T>.Count depends on the hardware.
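If in doubt about what Vector<T> maps to on a given machine, a quick check (not part of the original answer) is:

Console.WriteLine($"Vector.IsHardwareAccelerated = {Vector.IsHardwareAccelerated}");
Console.WriteLine($"Vector<float>.Count = {Vector<float>.Count}");   // 8 when Vector<T> uses 256-bit registers
Console.WriteLine($"Avx2.IsSupported = {Avx2.IsSupported}, Fma.IsSupported = {Fma.IsSupported}");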

aepot