3

I am writing a basic linear algebra subprograms (BLAS) library. There is one issue with the performance of the FMA code.

using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace LinearAlgebra
{
    public static class Level1
    {
        // Number of 32-bit float lanes in one 256-bit vector (256 / 32 = 8).
        const int Vector256Length = 8;
        // Element offsets of consecutive 8-float blocks, used for 2x loop unrolling.
        const int Vector256Block0 = Vector256Length * 0;
        const int Vector256Block1 = Vector256Length * 1;
        const int Vector256Block2 = Vector256Length * 2;

        /// <summary>
        /// In-place affine transform of N 2-D points stored as separate X/Y arrays,
        /// using FMA intrinsics:
        ///   X[n] = m11 * X[n] + m21 * Y[n] + dx
        ///   Y[n] = m12 * X[n] + m22 * Y[n] + dy
        /// The main loop is unrolled 2x (16 floats per iteration); an 8-float loop
        /// and a scalar tail (ApplyAffineTransformOdd) handle the remainder.
        /// NOTE(review): assumes Fma/Avx support on the running CPU — these
        /// intrinsics throw PlatformNotSupportedException otherwise; confirm
        /// callers guard with Fma.IsSupported.
        /// </summary>
        public static unsafe void ApplyAffineTransformFma(int N, float* X, float* Y, float m11, float m12, float m21, float m22, float dx, float dy)
        {
            // Broadcast each scalar coefficient into all 8 lanes of a 256-bit register.
            var vm11 = Avx.BroadcastScalarToVector256(&m11);
            var vm12 = Avx.BroadcastScalarToVector256(&m12);
            var vm21 = Avx.BroadcastScalarToVector256(&m21);
            var vm22 = Avx.BroadcastScalarToVector256(&m22);
            var vdx = Avx.BroadcastScalarToVector256(&dx);
            var vdy = Avx.BroadcastScalarToVector256(&dy);

            int n = 0;
            // Unrolled loop: two independent 8-float blocks per iteration so the
            // two FMA dependency chains can overlap in the pipeline.
            for (; n <= N - Vector256Block2; n += Vector256Block2)
            {
                float* xn = X + n;
                float* yn = Y + n;

                var vx0 = Avx.LoadVector256(xn + Vector256Block0);
                var vx1 = Avx.LoadVector256(xn + Vector256Block1);
                var vy0 = Avx.LoadVector256(yn + Vector256Block0);
                var vy1 = Avx.LoadVector256(yn + Vector256Block1);

                // Partial sums first: sx = m21*y + dx and sy = m12*x + dy.
                // sy must be computed from the ORIGINAL vx before vx is overwritten.
                var sx0 = Fma.MultiplyAdd(vm21, vy0, vdx);
                var sx1 = Fma.MultiplyAdd(vm21, vy1, vdx);
                var sy0 = Fma.MultiplyAdd(vm12, vx0, vdy);
                var sy1 = Fma.MultiplyAdd(vm12, vx1, vdy);

                // Final: x' = m11*x + sx, y' = m22*y + sy (two FMAs per lane total).
                vx0 = Fma.MultiplyAdd(vm11, vx0, sx0);
                vx1 = Fma.MultiplyAdd(vm11, vx1, sx1);
                vy0 = Fma.MultiplyAdd(vm22, vy0, sy0);
                vy1 = Fma.MultiplyAdd(vm22, vy1, sy1);

                Avx.Store(xn + Vector256Block0, vx0);
                Avx.Store(xn + Vector256Block1, vx1);
                Avx.Store(yn + Vector256Block0, vy0);
                Avx.Store(yn + Vector256Block1, vy1);
            }

            // Single 8-float block for a remainder of 8..15 elements.
            for (; n <= N - Vector256Length; n += Vector256Length)
            {
                float* xn = X + n;
                float* yn = Y + n;

                var vx = Avx.LoadVector256(xn);
                var vy = Avx.LoadVector256(yn);

                var sx = Fma.MultiplyAdd(vm21, vy, vdx);
                var sy = Fma.MultiplyAdd(vm12, vx, vdy);

                vx = Fma.MultiplyAdd(vm11, vx, sx);
                vy = Fma.MultiplyAdd(vm22, vy, sy);

                Avx.Store(xn, vx);
                Avx.Store(yn, vy);
            }

            // Scalar tail: remaining 0..7 elements.
            ApplyAffineTransformOdd(N - n, X + n, Y + n, m11, m12, m21, m22, dx, dy);
        }

        /// <summary>
        /// Same in-place transform as <see cref="ApplyAffineTransformFma"/> but
        /// built from separate AVX multiply/add instructions (no fused multiply-add),
        /// 8 floats per iteration, with a scalar tail for the remainder.
        /// NOTE(review): assumes Avx.IsSupported — confirm callers guard this.
        /// </summary>
        public static unsafe void ApplyAffineTransformAvx(int N, float* X, float* Y, float m11, float m12, float m21, float m22, float dx, float dy)
        {
            // Broadcast each scalar coefficient into all 8 lanes of a 256-bit register.
            var vm11 = Avx.BroadcastScalarToVector256(&m11);
            var vm12 = Avx.BroadcastScalarToVector256(&m12);
            var vm21 = Avx.BroadcastScalarToVector256(&m21);
            var vm22 = Avx.BroadcastScalarToVector256(&m22);
            var vdx = Avx.BroadcastScalarToVector256(&dx);
            var vdy = Avx.BroadcastScalarToVector256(&dy);

            int n = 0;
            for (; n <= N - Vector256Length; n += Vector256Length)
            {
                float* xn = X + n;
                float* yn = Y + n;

                var vx = Avx.LoadVector256(xn);
                var vy = Avx.LoadVector256(yn);

                // Four independent products: m11*x, m21*y, m12*x, m22*y.
                var x1 = Avx.Multiply(vm11, vx);
                var x2 = Avx.Multiply(vm21, vy);
                var y1 = Avx.Multiply(vm12, vx);
                var y2 = Avx.Multiply(vm22, vy);

                // Sum the products, then add the translation terms.
                x1 = Avx.Add(x1, x2);
                y1 = Avx.Add(y1, y2);

                vx = Avx.Add(x1, vdx);
                vy = Avx.Add(y1, vdy);

                Avx.Store(xn, vx);
                Avx.Store(yn, vy);
            }

            // Scalar tail: remaining 0..7 elements.
            ApplyAffineTransformOdd(N - n, X + n, Y + n, m11, m12, m21, m22, dx, dy);
        }

        /// <summary>
        /// Scalar (non-vectorized) in-place affine transform; used by the SIMD
        /// variants to process the tail elements that do not fill a full vector.
        /// Note: xn/yn are read into locals first so both output formulas use the
        /// original values even though the update is in place.
        /// </summary>
        public static unsafe void ApplyAffineTransformOdd(int N, float* X, float* Y, float m11, float m12, float m21, float m22, float dx, float dy)
        {
            for (int n = 0; n < N; n++)
            {
                float xn = X[n];
                float yn = Y[n];

                X[n] = m11 * xn + m21 * yn + dx;
                Y[n] = m12 * xn + m22 * yn + dy;
            }
        }
    }
}

Here are the benchmark results. The testing machine is Intel Core i5-7200U (Skylake architecture). The code was compiled in x64 mode with .NET 5.

Method N Mean Error StdDev Min Max Median Ratio
AffineTransformFma 68 1.805 us 0.0018 us 0.0017 us 1.802 us 1.808 us 1.805 us 1.00
AffineTransformAvx 68 1.152 us 0.0158 us 0.0140 us 1.137 us 1.184 us 1.150 us 0.64
AffineTransformFma 1159 25.966 us 0.1048 us 0.0929 us 25.843 us 26.114 us 25.999 us 1.00
AffineTransformAvx 1159 14.070 us 0.0174 us 0.0145 us 14.051 us 14.104 us 14.066 us 0.54
AffineTransformFma 4101 90.094 us 0.1041 us 0.0974 us 89.865 us 90.214 us 90.140 us 1.00
AffineTransformAvx 4101 48.180 us 0.0933 us 0.0779 us 48.089 us 48.320 us 48.149 us 0.53
AffineTransformFma 16389 360.215 us 0.2143 us 0.1789 us 359.840 us 360.456 us 360.266 us 1.00
AffineTransformAvx 16389 191.222 us 0.3403 us 0.3183 us 190.660 us 191.765 us 191.170 us 0.53
AffineTransformFma 32773 725.299 us 0.8294 us 0.7758 us 723.925 us 726.415 us 725.374 us 1.00
AffineTransformAvx 32773 379.920 us 0.8381 us 0.6999 us 378.887 us 381.257 us 379.776 us 0.52

Benchmark code:

using BenchmarkDotNet.Attributes;
using LinearAlgebra;
using LinearAlgebra.Tests;

namespace Benchmarks.LinearAlgebra
{
    [MinColumn, MaxColumn, MedianColumn]
    public unsafe class TransformBenchmark
    {
        // Sizes are deliberately NOT multiples of 8/16 so the unrolled loop,
        // the single-vector loop, and the scalar tail are all exercised.
        [Params(68, 1159, 4101, 16389, 32773)]
        public int N { get; set; }

        public float[] X;
        public float[] Y;

        // NOTE(review): these constants are trivial (1 and 0). The JIT may fold
        // multiply-by-one / add-zero in one code path but not the other, which
        // would skew the FMA-vs-AVX comparison — confirm against generated asm,
        // or use non-trivial coefficients.
        const float m11 = 1f;
        const float m12 = 0f;
        const float m21 = 1f;
        const float m22 = 1f;
        const float dx = 0f;
        const float dy = 0f;

        [GlobalSetup]
        public void GlobalSetup()
        {
            // Arrays are filled once per parameter set. The benchmarked transforms
            // update X/Y in place, so values drift across iterations; with these
            // constants X accumulates (x+y) each pass while Y is unchanged.
            X = LinAlgGenerator.GetRandomVector(N);
            Y = LinAlgGenerator.GetRandomVector(N);
        }

        /// <summary>Baseline: FMA-based transform (2x-unrolled main loop).</summary>
        [Benchmark(Baseline = true)]
        public void AffineTransformFma()
        {
            // Pin the managed arrays so raw pointers can be handed to the kernel.
            fixed (float* ptrX = X, ptrY = Y)
            {
                Level1.ApplyAffineTransformFma(N, ptrX, ptrY, m11, m12, m21, m22, dx, dy);
            }
        }
        
        /// <summary>Comparison: plain AVX multiply+add transform.</summary>
        [Benchmark]
        public void AffineTransformAvx()
        {
            fixed (float* ptrX = X, ptrY = Y)
            {
                Level1.ApplyAffineTransformAvx(N, ptrX, ptrY, m11, m12, m21, m22, dx, dy);
            }
        }
    }
}

Which is kind of strange. The FMA code is almost two times slower than AVX! Based on https://software.intel.com/sites/landingpage/IntrinsicsGuide/ the Fma.MultiplyAdd method has latency=4 and throughput=0.5, which means that 4 independent operations can be executed in 4x4x0.5=8 cycles (assuming that the processor has only one FMA port, which is of course not true).

Fma version of the method uses two such blocks so overall performance of one iteration with loading (latency=7 and throughput=0.5) and saving (latency=5 and throughput=1) is 4x7x0.5 + 2x4x4x0.5 + 4x5x1 = 50 cycles per 16 floating-point numbers.

Avx version of the code uses 4 independent multiply operations (latency=4 and throughput=0.5) and two blocks of two independent add operations (latency=4 and throughput=0.5). So with loading and saving overall performance should be: 2x7x0.5 + 4x4x0.5 + 2x4x0.5 + 2x4x0.5 +2x5x1 = 33 cycles per 8 floating-point numbers (66 cycles per 16 floating point numbers).

Theoretically avx code should be slower than fma code. What am I doing wrong? The code is not using more than 16 256-bit registers. I also tried to test Fma version of the code without pipelining - results were the same (avx performs better).

Update It looks like that was just a bug in BenchmarkDotNet. After several launches of benchmark code that I wrote without BenchmarkDotNet framework I got the following results:

Method N Time Ratio
AffineTransformAvx 68 27.24 ns 1.00
AffineTransformFma 68 29.19 ns 1.07
AffineTransformAvx 1159 258.33 ns 1.00
AffineTransformFma 1159 188.40 ns 0.73
AffineTransformAvx 4101 818.48 ns 1.00
AffineTransformFma 4101 582.80 ns 0.71
AffineTransformAvx 16389 4,263.31 ns 1.00
AffineTransformFma 16389 2,959.02 ns 0.69
AffineTransformAvx 32773 9,782.48 ns 1.00
AffineTransformFma 32773 6,943.25 ns 0.71

And yes - the FMA version is faster by 30% in most cases! The results of the benchmarks do not depend on their order (first AVX and then FMA, or vice versa).

  • The question could be more complete if you show the resulting assembly code (real assembly, not IL) – Alex Guteniev Oct 08 '21 at 12:22
  • 1
    4 independent SIMD FMA operations on Skylake takes 2 cycles of throughput resources: 4 x 0.5. (Each one produces a result 4 cycles after it starts). Those are *reciprocal* throughputs. It's 0.5 specifically because Skylake has two FMA ports, and each is fully pipelined, able to start a new FMA every cycle. The latency bandwidth product 4 / 0.5 = 8 independent FMAs in flight if you don't bottleneck on something else (like latency of a loop-carried dep chain, but I don't think there is one involving FMAs; the vars used across loop iterations are loop-invariant, not updated inside the loop) – Peter Cordes Oct 08 '21 at 12:32
  • 1
    How did you benchmark this? Could warm-up effects be causing the difference? Like if you test FMA first and it touches the output memory for the first time? [Idiomatic way of performance evaluation?](https://stackoverflow.com/q/60291987) Or the clock speed isn't ramped up to max turbo yet? The difference you're seeing is probably not explained by the functions themselves; the FMA version should be faster. – Peter Cordes Oct 08 '21 at 12:37
  • I've tested it with BenchmarkDotNet. I tried to test the avx version first. The results are the same (fma is two times slower) – Станислав Герасименко Oct 08 '21 at 12:50
  • I added benchmark code to the question. The identic transform was used to not cause problems with speed of subnormals and normal numbers. Numbers in arrays X and Y are all less than 1 in magnitude. – Станислав Герасименко Oct 08 '21 at 12:55
  • 1
    With your constants being very simple, like 1.0 and 0.0, it might be possible that .NET optimized away some multiplies and adds, but for some reason didn't do the same for FMA? So perhaps making it take even fewer instructions than the FMA version. Have you tried any other set of constants with BenchmarkDotNet? You shouldn't have subnormal numbers if you use constants like 1.23, 1.24, 1.26, etc. when your input arrays are all mag<1, especially if they're non-negative. You're not iterating this transform on the same point multiple times anyway. – Peter Cordes Oct 08 '21 at 20:26
  • Yes, you are right. I have tried m11=m22=MathF.BitIncrement(1) and m12=m21=dx=dy=float.Epsilon in the last benchmark so compiler could not optimize addition of zero vector and multiplication by one vector. The last benchmark was runned with this constants. I couldnt have use constants like 1.24, etc. because then i should use [IterationSetup] to prevent X and Y changing, which can affect on results. – Станислав Герасименко Oct 09 '21 at 06:49
  • Seems like the problem in the first version of benchmarks was not in constants, because new benchmark runs the same time with simple (1,0) and not (epsilon, bitincrenent(1)) constants. I also run benchmarks with random params every iteration. Results were the same. Look like RyuJit couldnt optimize patterns like addition of zero vector when this vector is broadcasted from scalar (not set as Vector255.Zero). Anyway, in new benchmark Fma version is faster than Avx and i think that was just a bug in BenchmarkDotNet (just have a look at time magnitudes in update). – Станислав Герасименко Oct 09 '21 at 07:12

0 Answers