I've created the following test method to understand how SSE and AVX work and what their benefits are. Now I'm actually very surprised to see that System.Runtime.Intrinsics.X86.Avx.Multiply
is less than 5% faster compared to the traditional approach with the *
operator.
I don't understand why this is. Would you please enlighten me?
I've put my benchmark results in the last line of the code examples.
(long TicksSse2, long TicksAlu) TestFloat()
{
Vector256<float> x = Vector256.Create((float)255, (float)128, (float)64, (float)32, (float)16, (float)8, (float)4, (float)2);
Vector256<float> y = Vector256.Create((float).5);
Stopwatch timerSse = new Stopwatch();
Stopwatch timerAlu = new Stopwatch();
for (int cnt = 0; cnt < 100_000_000; cnt++)
{
timerSse.Start();
var xx = Avx.Multiply(x, y);
timerSse.Stop();
timerAlu.Start();
float a = (float)255 * (float).5;
float b = (float)128 * (float).5;
float c = (float)64 * (float).5;
float d = (float)32 * (float).5;
float e = (float)16 * (float).5;
float f = (float)8 * (float).5;
float g = (float)4 * (float).5;
float h = (float)2 * (float).5;
timerAlu.Stop();
}
return (timerSse.ElapsedMilliseconds, timerAlu.ElapsedMilliseconds);
// timerSse = 1688ms; timerAlu = 1748ms.
}
Even more drastically, I created the following test method for mass byte multiplication. This one is even slower using the SSE commands:
Vector128<byte> MultiplyBytes(Vector128<byte> x, Vector128<byte> y)
{
Vector128<ushort> xAsUShort = x.AsUInt16();
Vector128<ushort> yAsUShort = y.AsUInt16();
Vector128<ushort> dstEven = Sse2.MultiplyLow(xAsUShort, yAsUShort);
Vector128<ushort> dstOdd = Sse2.MultiplyLow(Sse2.ShiftRightLogical(xAsUShort, 8), Sse2.ShiftRightLogical(yAsUShort, 8));
return Sse2.Or(Sse2.ShiftLeftLogical(dstOdd, 8), Sse2.And(dstEven, helper)).AsByte();
}
(long TicksSse2, long TicksAlu) TestBytes()
{
Vector128<byte> x = Vector128.Create((byte)1, (byte)2, (byte)3, (byte)4, (byte)5, (byte)6, (byte)7, (byte)8, (byte)9, (byte)10, (byte)11, (byte)12, (byte)13, (byte)14, (byte)15, (byte)16);
Vector128<byte> y = Vector128.Create((byte)2);
Stopwatch timerSse = new Stopwatch();
Stopwatch timerAlu = new Stopwatch();
for (int cnt = 0; cnt < 100_000_000; cnt++)
{
timerSse.Start();
var xx = MultiplyBytes(x, y);
timerSse.Stop();
timerAlu.Start();
byte a = (byte)1 * (byte)2;
byte b = (byte)2 * (byte)2;
byte c = (byte)3 * (byte)2;
byte d = (byte)4 * (byte)2;
byte e = (byte)5 * (byte)2;
byte f = (byte)6 * (byte)2;
byte g = (byte)7 * (byte)2;
byte h = (byte)8 * (byte)2;
byte i = (byte)9 * (byte)2;
byte j = (byte)10 * (byte)2;
byte k = (byte)11 * (byte)2;
byte l = (byte)12 * (byte)2;
byte m = (byte)13 * (byte)2;
byte n = (byte)14 * (byte)2;
byte o = (byte)15 * (byte)2;
byte p = (byte)16 * (byte)2;
timerAlu.Stop();
}
return (timerSse.ElapsedMilliseconds, timerAlu.ElapsedMilliseconds);
// timerSse = 3439ms; timerAlu = 1800ms
}