I am writing a basic linear algebra subprograms (BLAS) library. There is a performance issue with the FMA code.
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace LinearAlgebra
{
public static class Level1
{
    // Number of 32-bit floats held by one 256-bit vector register.
    const int Vector256Length = 8;
    // Element offsets for the unrolled (two-vectors-per-iteration) main loop.
    const int Vector256Block0 = Vector256Length * 0;
    const int Vector256Block1 = Vector256Length * 1;
    const int Vector256Block2 = Vector256Length * 2;

    /// <summary>
    /// In-place affine transform of N points using FMA intrinsics:
    ///   X[n] = m11 * X[n] + m21 * Y[n] + dx
    ///   Y[n] = m12 * X[n] + m22 * Y[n] + dy
    /// The main loop is unrolled to process 16 floats (two 256-bit vectors) per
    /// iteration; an 8-float loop and then a scalar loop handle the remainder.
    /// Falls back to the AVX path (and, transitively, to scalar code) when FMA
    /// is not supported, instead of throwing PlatformNotSupportedException.
    /// NOTE: FMA fuses multiply and add with a single rounding, so the last bit
    /// of a result may differ from the AVX/scalar paths.
    /// </summary>
    /// <param name="N">Number of (x, y) pairs to transform; non-positive values are a no-op.</param>
    /// <param name="X">Pointer to N x-coordinates, updated in place.</param>
    /// <param name="Y">Pointer to N y-coordinates, updated in place.</param>
    public static unsafe void ApplyAffineTransformFma(int N, float* X, float* Y, float m11, float m12, float m21, float m22, float dx, float dy)
    {
        if (!Fma.IsSupported)
        {
            // Graceful degradation on hardware without FMA (the AVX path in
            // turn degrades to the scalar path when AVX is absent too).
            ApplyAffineTransformAvx(N, X, Y, m11, m12, m21, m22, dx, dy);
            return;
        }
        var vm11 = Avx.BroadcastScalarToVector256(&m11);
        var vm12 = Avx.BroadcastScalarToVector256(&m12);
        var vm21 = Avx.BroadcastScalarToVector256(&m21);
        var vm22 = Avx.BroadcastScalarToVector256(&m22);
        var vdx = Avx.BroadcastScalarToVector256(&dx);
        var vdy = Avx.BroadcastScalarToVector256(&dy);
        int n = 0;
        // Unrolled loop: two independent 8-float blocks per iteration so the
        // FMA units can pipeline independent operations.
        for (; n <= N - Vector256Block2; n += Vector256Block2)
        {
            float* xn = X + n;
            float* yn = Y + n;
            var vx0 = Avx.LoadVector256(xn + Vector256Block0);
            var vx1 = Avx.LoadVector256(xn + Vector256Block1);
            var vy0 = Avx.LoadVector256(yn + Vector256Block0);
            var vy1 = Avx.LoadVector256(yn + Vector256Block1);
            // sx/sy are computed from the ORIGINAL vx/vy before either is
            // overwritten below — this ordering is essential for correctness.
            var sx0 = Fma.MultiplyAdd(vm21, vy0, vdx);
            var sx1 = Fma.MultiplyAdd(vm21, vy1, vdx);
            var sy0 = Fma.MultiplyAdd(vm12, vx0, vdy);
            var sy1 = Fma.MultiplyAdd(vm12, vx1, vdy);
            vx0 = Fma.MultiplyAdd(vm11, vx0, sx0);
            vx1 = Fma.MultiplyAdd(vm11, vx1, sx1);
            vy0 = Fma.MultiplyAdd(vm22, vy0, sy0);
            vy1 = Fma.MultiplyAdd(vm22, vy1, sy1);
            Avx.Store(xn + Vector256Block0, vx0);
            Avx.Store(xn + Vector256Block1, vx1);
            Avx.Store(yn + Vector256Block0, vy0);
            Avx.Store(yn + Vector256Block1, vy1);
        }
        // Single-vector loop for a remaining block of 8.
        for (; n <= N - Vector256Length; n += Vector256Length)
        {
            float* xn = X + n;
            float* yn = Y + n;
            var vx = Avx.LoadVector256(xn);
            var vy = Avx.LoadVector256(yn);
            var sx = Fma.MultiplyAdd(vm21, vy, vdx);
            var sy = Fma.MultiplyAdd(vm12, vx, vdy);
            vx = Fma.MultiplyAdd(vm11, vx, sx);
            vy = Fma.MultiplyAdd(vm22, vy, sy);
            Avx.Store(xn, vx);
            Avx.Store(yn, vy);
        }
        // Scalar tail for the last N mod 8 elements.
        ApplyAffineTransformOdd(N - n, X + n, Y + n, m11, m12, m21, m22, dx, dy);
    }

    /// <summary>
    /// In-place affine transform of N points using plain AVX multiply/add:
    ///   X[n] = m11 * X[n] + m21 * Y[n] + dx
    ///   Y[n] = m12 * X[n] + m22 * Y[n] + dy
    /// Processes 8 floats per iteration; the scalar loop handles the remainder.
    /// Falls back entirely to the scalar path when AVX is not supported,
    /// instead of throwing PlatformNotSupportedException.
    /// </summary>
    /// <param name="N">Number of (x, y) pairs to transform; non-positive values are a no-op.</param>
    /// <param name="X">Pointer to N x-coordinates, updated in place.</param>
    /// <param name="Y">Pointer to N y-coordinates, updated in place.</param>
    public static unsafe void ApplyAffineTransformAvx(int N, float* X, float* Y, float m11, float m12, float m21, float m22, float dx, float dy)
    {
        if (!Avx.IsSupported)
        {
            // Graceful degradation on hardware without AVX.
            ApplyAffineTransformOdd(N, X, Y, m11, m12, m21, m22, dx, dy);
            return;
        }
        var vm11 = Avx.BroadcastScalarToVector256(&m11);
        var vm12 = Avx.BroadcastScalarToVector256(&m12);
        var vm21 = Avx.BroadcastScalarToVector256(&m21);
        var vm22 = Avx.BroadcastScalarToVector256(&m22);
        var vdx = Avx.BroadcastScalarToVector256(&dx);
        var vdy = Avx.BroadcastScalarToVector256(&dy);
        int n = 0;
        for (; n <= N - Vector256Length; n += Vector256Length)
        {
            float* xn = X + n;
            float* yn = Y + n;
            var vx = Avx.LoadVector256(xn);
            var vy = Avx.LoadVector256(yn);
            // Four independent multiplies, then two adds, then the offsets.
            var x1 = Avx.Multiply(vm11, vx);
            var x2 = Avx.Multiply(vm21, vy);
            var y1 = Avx.Multiply(vm12, vx);
            var y2 = Avx.Multiply(vm22, vy);
            x1 = Avx.Add(x1, x2);
            y1 = Avx.Add(y1, y2);
            vx = Avx.Add(x1, vdx);
            vy = Avx.Add(y1, vdy);
            Avx.Store(xn, vx);
            Avx.Store(yn, vy);
        }
        // Scalar tail for the last N mod 8 elements.
        ApplyAffineTransformOdd(N - n, X + n, Y + n, m11, m12, m21, m22, dx, dy);
    }

    /// <summary>
    /// Scalar fallback used for the remainder elements that do not fill a
    /// whole 256-bit vector (and for hardware without AVX support):
    ///   X[n] = m11 * X[n] + m21 * Y[n] + dx
    ///   Y[n] = m12 * X[n] + m22 * Y[n] + dy
    /// </summary>
    /// <param name="N">Number of (x, y) pairs to transform; non-positive values are a no-op.</param>
    /// <param name="X">Pointer to N x-coordinates, updated in place.</param>
    /// <param name="Y">Pointer to N y-coordinates, updated in place.</param>
    public static unsafe void ApplyAffineTransformOdd(int N, float* X, float* Y, float m11, float m12, float m21, float m22, float dx, float dy)
    {
        for (int n = 0; n < N; n++)
        {
            // Read both coordinates before writing either one: X[n] and Y[n]
            // both depend on the pre-transform values.
            float xn = X[n];
            float yn = Y[n];
            X[n] = m11 * xn + m21 * yn + dx;
            Y[n] = m12 * xn + m22 * yn + dy;
        }
    }
}
}
Here are the benchmark results. The testing machine is Intel Core i5-7200U (Skylake architecture). The code was compiled in x64 mode with .NET 5.
Method | N | Mean | Error | StdDev | Min | Max | Median | Ratio |
---|---|---|---|---|---|---|---|---|
AffineTransformFma | 68 | 1.805 us | 0.0018 us | 0.0017 us | 1.802 us | 1.808 us | 1.805 us | 1.00 |
AffineTransformAvx | 68 | 1.152 us | 0.0158 us | 0.0140 us | 1.137 us | 1.184 us | 1.150 us | 0.64 |
AffineTransformFma | 1159 | 25.966 us | 0.1048 us | 0.0929 us | 25.843 us | 26.114 us | 25.999 us | 1.00 |
AffineTransformAvx | 1159 | 14.070 us | 0.0174 us | 0.0145 us | 14.051 us | 14.104 us | 14.066 us | 0.54 |
AffineTransformFma | 4101 | 90.094 us | 0.1041 us | 0.0974 us | 89.865 us | 90.214 us | 90.140 us | 1.00 |
AffineTransformAvx | 4101 | 48.180 us | 0.0933 us | 0.0779 us | 48.089 us | 48.320 us | 48.149 us | 0.53 |
AffineTransformFma | 16389 | 360.215 us | 0.2143 us | 0.1789 us | 359.840 us | 360.456 us | 360.266 us | 1.00 |
AffineTransformAvx | 16389 | 191.222 us | 0.3403 us | 0.3183 us | 190.660 us | 191.765 us | 191.170 us | 0.53 |
AffineTransformFma | 32773 | 725.299 us | 0.8294 us | 0.7758 us | 723.925 us | 726.415 us | 725.374 us | 1.00 |
AffineTransformAvx | 32773 | 379.920 us | 0.8381 us | 0.6999 us | 378.887 us | 381.257 us | 379.776 us | 0.52 |
Benchmark code:
using BenchmarkDotNet.Attributes;
using LinearAlgebra;
using LinearAlgebra.Tests;
namespace Benchmarks.LinearAlgebra
{
// Benchmarks the FMA and AVX affine-transform kernels against each other.
// Every public member here is read by BenchmarkDotNet via attributes/reflection,
// so names and signatures are part of the benchmark's interface.
[MinColumn, MaxColumn, MedianColumn]
public unsafe class TransformBenchmark
{
    // Vector lengths; none is a multiple of 8 or 16, so the 8-float loop and
    // the scalar remainder paths are exercised as well as the unrolled loop.
    [Params(68, 1159, 4101, 16389, 32773)]
    public int N { get; set; }

    // Input/output buffers; the kernels transform them in place, so contents
    // drift across iterations (values are overwritten, not re-randomized).
    public float[] X;
    public float[] Y;

    // Transform coefficients. With these constants the transform computes
    // X[n] = X[n] + Y[n] and leaves Y[n] unchanged (m12 = 0, m22 = 1, no offsets).
    const float m11 = 1f;
    const float m12 = 0f;
    const float m21 = 1f;
    const float m22 = 1f;
    const float dx = 0f;
    const float dy = 0f;

    [GlobalSetup]
    public void GlobalSetup()
    {
        // NOTE(review): LinAlgGenerator lives in LinearAlgebra.Tests; presumably
        // GetRandomVector(N) returns a float[] of length N with random values —
        // confirm against its definition.
        X = LinAlgGenerator.GetRandomVector(N);
        Y = LinAlgGenerator.GetRandomVector(N);
    }

    // Baseline: the FMA kernel; the AVX result is reported as a ratio to this.
    [Benchmark(Baseline = true)]
    public void AffineTransformFma()
    {
        // Pin the managed arrays so raw pointers can be handed to the kernel.
        fixed (float* ptrX = X, ptrY = Y)
        {
            Level1.ApplyAffineTransformFma(N, ptrX, ptrY, m11, m12, m21, m22, dx, dy);
        }
    }

    [Benchmark]
    public void AffineTransformAvx()
    {
        fixed (float* ptrX = X, ptrY = Y)
        {
            Level1.ApplyAffineTransformAvx(N, ptrX, ptrY, m11, m12, m21, m22, dx, dy);
        }
    }
}
}
Which is kind of strange. Fma code is almost two times slower than avx! Based on https://software.intel.com/sites/landingpage/IntrinsicsGuide/ Fma.MultiplyAdd
method has latency=4 and throughput=0.5, which means that 4 independent operations can be executed in 4x4x0.5=8 cycles (assuming that the processor has only one FMA port, which is of course not true).
Fma version of the method uses two such blocks so overall performance of one iteration with loading (latency=7 and throughput=0.5) and saving (latency=5 and throughput=1) is 4x7x0.5 + 2x4x4x0.5 + 4x5x1 = 50 cycles per 16 floating-point numbers.
Avx version of the code uses 4 independent multiply operations (latency=4 and throughput=0.5) and two blocks of two independent add operations (latency=4 and throughput=0.5). So with loading and saving overall performance should be: 2x7x0.5 + 4x4x0.5 + 2x4x0.5 + 2x4x0.5 +2x5x1 = 33 cycles per 8 floating-point numbers (66 cycles per 16 floating point numbers).
Theoretically, the AVX code should be slower than the FMA code. What am I doing wrong? The code does not use more than 16 256-bit registers. I also tested the FMA version of the code without pipelining — the results were the same (AVX performed better).
Update It looks like that was just a bug in BenchmarkDotNet. After several launches of benchmark code that I wrote without BenchmarkDotNet framework I got the following results:
Method | N | Time | Ratio |
---|---|---|---|
AffineTransformAvx | 68 | 27.24 ns | 1.00 |
AffineTransformFma | 68 | 29.19 ns | 1.07 |
AffineTransformAvx | 1159 | 258.33 ns | 1.00 |
AffineTransformFma | 1159 | 188.40 ns | 0.73 |
AffineTransformAvx | 4101 | 818.48 ns | 1.00 |
AffineTransformFma | 4101 | 582.80 ns | 0.71 |
AffineTransformAvx | 16389 | 4,263.31 ns | 1.00 |
AffineTransformFma | 16389 | 2,959.02 ns | 0.69 |
AffineTransformAvx | 32773 | 9,782.48 ns | 1.00 |
AffineTransformFma | 32773 | 6,943.25 ns | 0.71 |
And yes — the FMA version is faster by about 30% in most cases! The benchmark results do not depend on the order in which the benchmarks run (first AVX and then FMA, or vice versa).