
I am writing a complex simulation program, and it appears that the most time-consuming routine is the one multiplying a four-vector (float4) by a 4x4 matrix. I need to run this program on several computers, some of which are fairly old. That is why I tried to check the SIMD capabilities of such operations with the following code:

//#include <xmmintrin.h> // SSE
//#include <pmmintrin.h> // SSE3
//#include <nmmintrin.h> // SSE4.2
  #include <immintrin.h> // AVX

#include <iostream>
#include <ctime>
#include <string>

using namespace std;

// 4-vector, explicitly 16-byte aligned so that _mm_load_ps/_mm_store_ps are safe.
typedef struct __attribute__((aligned(16)))
{
    float x;
    float y;
    float z;
    float w;
}float4;

// typedef to simplify the pointer of function notation.
typedef void(*Function)(float4&,const float4*,const float4&);

float dot( const float4& in_A, const float4& in_x )
{
    return in_A.x*in_x.x + in_A.y*in_x.y + in_A.z*in_x.z + in_A.w*in_x.w; // 7 FLOPS
}

void A_times_x( float4& out_y, const float4* in_A, const float4& in_x )
{
    out_y.x = dot(in_A[0], in_x); // 7 FLOPS
    out_y.y = dot(in_A[1], in_x); // 7 FLOPS
    out_y.z = dot(in_A[2], in_x); // 7 FLOPS
    out_y.w = dot(in_A[3], in_x); // 7 FLOPS
}

void A_times_x_SSE( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Transpose the matrix and re-order the vector.
    _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

    __m128 u1 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,0,0,0));
    __m128 u2 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,1,1,1));
    __m128 u3 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,2,2,2));
    __m128 u4 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(3,3,3,3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, u1); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, u2); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, u3); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, u4); // 4 FLOPS

    // Sum the four partial products (vertical ADDs; no HADD is needed after the transpose)
    __m128 sum_01 = _mm_add_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_add_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_add_ps(sum_01, sum_23); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_SSE3( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.

    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, x); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, x); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, x); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, x); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 sum_01 = _mm_hadd_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_hadd_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_hadd_ps(sum_01, sum_23); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_SSE4( float4& out_y, const float4* in_A, const float4& in_x ) // 28 FLOPS
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.

    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_dp_ps(A0, x, 0xFF); // 4 FLOPS
    __m128 m1 = _mm_dp_ps(A1, x, 0xFF); // 4 FLOPS
    __m128 m2 = _mm_dp_ps(A2, x, 0xFF); // 4 FLOPS
    __m128 m3 = _mm_dp_ps(A3, x, 0xFF); // 4 FLOPS

    // Gather the four broadcast dot-product results into one register
    __m128 mov_01 = _mm_movelh_ps(m0, m1); // data movement, ~0 FLOPS
    __m128 mov_23 = _mm_movelh_ps(m2, m3); // data movement, ~0 FLOPS
    __m128 result = _mm_shuffle_ps(mov_01, mov_23, _MM_SHUFFLE(2, 0, 2, 0));

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_AVX( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m256 xx = _mm256_castps128_ps256(x);
           xx = _mm256_insertf128_ps(xx,x,1);
    __m256 A0 = _mm256_load_ps((const float*)(in_A + 0));
    __m256 A2 = _mm256_load_ps((const float*)(in_A + 2));

    // Multiply two matrix rows at a time with the duplicated vector x
    __m256 m0 = _mm256_mul_ps(A0, xx); // 8 FLOPS
    __m256 m2 = _mm256_mul_ps(A2, xx); // 8 FLOPS

    // Using HADD, we add eight floats at a time
    __m256 sum_00 = _mm256_hadd_ps(m0, m2); // 8 FLOPS

  /*__m128 sum_10 = _mm256_extractf128_ps(sum_00,0);
    __m128 sum_01 = _mm256_extractf128_ps(sum_00,1);

    __m128 result = _mm_hadd_ps(sum_10, sum_01); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);*/

    // Finally, store the result (no temp variables: direct HADD, which avoids
    // copies between the 128-bit and 256-bit registers). Note that the 256-bit
    // HADD works per 128-bit lane and interleaves the rows, leaving
    // {dot0, dot2, dot1, dot3}, so a last shuffle restores the component order.
    __m128 r = _mm_hadd_ps(_mm256_extractf128_ps(sum_00,0),
                           _mm256_extractf128_ps(sum_00,1));
    _mm_store_ps((float*)&out_y, _mm_shuffle_ps(r,r, _MM_SHUFFLE(3,1,2,0)));
}

void test_function ( Function f, string simd, unsigned int imax )
{
    float4 Y;
    float4 X1 = {0.5,1,0.2,0.7};
    float4 X2 = {0.7,1,0.2,0.5};
    float4 X3 = {0.5,0.2,1,0.7};
    float4 X4 = {1,0.7,0.2,0.5};
    // 32-byte aligned so that the _mm256_load_ps calls in the AVX version are safe.
    float4 A[4] __attribute__((aligned(32))) = {{0.5,1,0.2,0.7},
                   {0.6,0.4,0.1,0.8},
                   {0.3,0.8,0.2,0.5},
                   {1,0.4,0.6,0.9}};

    clock_t tstart = clock();

    for( unsigned int i=0 ; i<imax ; i++ )
    for( unsigned long int j=0 ; j<250000000 ; j++ )
    // Avoid looping over long long: it is 2 times slower!
    {
        // The function pointer forces a real call, whereas a direct
        // call would be inlined and the results overestimated.
        f( Y,A,X1 );
        f( Y,A,X2 );
        f( Y,A,X3 );
        f( Y,A,X4 );
    }

    clock_t tend = clock();

    double diff = static_cast<double>(tend - tstart) / CLOCKS_PER_SEC; // clock() ticks -> seconds

    cout << "Time  (" << simd << ") = " << diff << " s" << endl;
    cout << "Nops  (" << simd << ") = " << (double) imax << ".10^9" << endl;
    cout << "Power (" << simd << ") = " << (double) imax * 28. / diff << " GFLOPS" << endl; // 28 FLOPS for std.
    cout << endl;
}

int main ( int argc, char *argv[] )
{
    test_function ( &A_times_x     ,"std" , 1 );
    test_function ( &A_times_x_SSE ,"SSE" , 2 );
    test_function ( &A_times_x_SSE3,"SSE3", 3 );
    test_function ( &A_times_x_SSE4,"SSE4", 1 );
    test_function ( &A_times_x_AVX ,"AVX" , 3 );

    return 0;
}

I am having some trouble understanding the improvements for this problem. When running the code I obtain the following results (Intel Core i5 4670K, 3.4 GHz, Haswell, Code::Blocks + MinGW compiler, using -O2 -march=corei7-avx):

Time  (std) = 6.287 s
Nops  (std) = 1.10^9
Power (std) = 4.45363 GFLOPS

Time  (SSE) = 6.661 s
Nops  (SSE) = 2.10^9
Power (SSE) = 8.40715 GFLOPS

Time  (SSE3) = 8.361 s
Nops  (SSE3) = 3.10^9
Power (SSE3) = 10.0466 GFLOPS

Time  (SSE4) = 6.131 s
Nops  (SSE4) = 1.10^9
Power (SSE4) = 4.56695 GFLOPS

Time  (AVX) = 8.767 s
Nops  (AVX) = 3.10^9
Power (AVX) = 9.58138 GFLOPS

My questions are the following:

  1. Is it possible to improve the performance/speed-up further? It should be x4 (maximum) for SSE and x8 for AVX.

  2. Why is AVX not faster than SSE3?

For those who would say "stop using your own stuff, use the Intel Math Kernel Library", I reply: "I would rather not, because I want a small executable file, and I only need SIMD for this specific case, not elsewhere" ;-)

Asohan
  • What compiler did you use, and what optimisation switches did you enable? – Paul R Jun 26 '15 at 15:40
  • The problem is that you spend more time shuffling data around than doing useful arithmetic. As it is, a vector x matrix multiply of size 4 has too little computation to optimize in isolation. You may need to broaden the scope and consider changing your data layout to something a bit more SIMD-friendly. – Mysticial Jun 26 '15 at 15:41
  • I tested your code with MSVC2013 on a Haswell, and my results are different. I set all the tests to 10x10^9, and get 11.9 s for std and 2.95 s for all the others (std is exactly x4 slower). Also, the std version was entirely optimized out, so I added a volatile on the results. It also seems that all the calls are inlined, so there are no calls, which could explain the big slowdown when you do actual calls. – ElderBug Jun 26 '15 at 15:48
  • I have a feeling the OP is either using an old compiler or does not have optimisation enabled, as every compiler I've tried here optimises all the loops away, making the test results meaningless. – Paul R Jun 26 '15 at 15:52
  • I use MinGW with Code::Blocks (I know ... the shame), and my compiler flags are: -Wall -O2 -march=corei7-avx. I will try with MSVC2013 tonight to check this with the Intel C++ compiler. Thanks for your useful comment! – Asohan Jun 26 '15 at 15:58
  • @PaulR Your compiler optimizes out the intrinsics? I find this very doubtful. I compiled with both MSVC and GCC with -O2, and neither of them optimizes out the intrinsics. – ElderBug Jun 26 '15 at 16:00
  • @ElderBug: no - the *loops* get optimised away by current versions of g++, clang, et al, so each function only gets called once, making the timing results invalid. – Paul R Jun 26 '15 at 16:12
  • Well, I tried with GCC and with the Intel compiler with the same flags I used before. The GCC version gives me the same results, and the Intel version does not work properly (certainly because I am a newbie): it seems that the loops are not done properly and it gives me 'time=0', except for the first 'standard' (std) version. Any idea? – Asohan Jun 26 '15 at 21:13
  • Try putting your `A_times_xxx` functions in a separate source file - that way the compiler will not be able to optimise away the loops (unless it does whole program optimisation). – Paul R Jun 27 '15 at 15:00
  • My first post on SO [efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-product: what's the point?](http://stackoverflow.com/questions/14967969/efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-prod) – Z boson Jun 27 '15 at 20:29
  • If you're using MinGW (not MinGW-w64) you need to tell the compiler to use SSE instead of x87 because in 32-bit mode x87 is the default. Use `-O3 -mfpmath=sse -mavx`. – Z boson Jun 27 '15 at 20:32
  • @Paul R: I tried in a separate source file, but still the same results :-/ Not being familiar with intrinsics/optimization/compiler options, I understand that the maximum speed-up of my code should be a factor 4 (8 for AVX), but I don't know how to show it with this particular example. Moreover, I would like to understand what is happening, to avoid making this mistake again. – Asohan Jun 27 '15 at 20:37
  • I am able to reproduce your results using `-O2`. Using `-O3` it's optimized away. – Z boson Jun 27 '15 at 20:43
  • @Z boson: I already added the -march=corei7-avx flag for the compilation. My problem definitely lies in the way the compiler is doing the loops. I will try to increase the complexity of the tests and see if I can reach this 'x4' speed-up, and the 'x8' speed-up for AVX. – Asohan Jun 27 '15 at 20:43
  • Did you read the first question I posted on SO, linked above? – Z boson Jun 27 '15 at 20:46
  • @Z boson: I avoid using -O3 because it sometimes gives strange results. But I tried it too as a test, and the result is identical. The problem is not to reproduce these results, it is: why do I get too much speed-up (compared to the theoretical x4) for SSE, and is the AVX function optimal for a x8 speed-up? – Asohan Jun 27 '15 at 20:48
  • @Z boson: yes, I have read your question, and yes, I used it for my code. But my problem is not there. – Asohan Jun 27 '15 at 20:50
  • @PaulR, it's optimized away with `-O3` but not with `-O2` on GCC 4.9.2 – Z boson Jun 27 '15 at 20:51
  • I finally understand the problem a bit more. Indeed, the calls were not really done, and the register seems to return precalculated values without really doing the computation. This explains the incredibly huge speed-up. I finally used the function pointer (commented in my example), and it seems to work. Nevertheless, the theoretical speed-up of x4 (x8 for AVX) is not reached. I get a factor x2 for SSE, x2.2 for SSE3, x1.1 for SSE4 (because of _mm_dp_ps), and x2.1 for AVX. So, any ideas, optimizations of the functions? Is this the maximum doable? – Asohan Jun 27 '15 at 21:13
  • Yes, don't do the transpose (`_MM_TRANSPOSE4_PS`) in the SSE version. Construct your matrix already transposed. That's the SIMD friendly way of doing it. What speed/FLOPS do you get when you comment out the line with `_MM_TRANSPOSE4_PS`? – Z boson Jun 27 '15 at 21:18
  • @Z boson: I updated the example above with the new results. It is not that bad with _MM_TRANSPOSE4_PS actually, but that is certainly because of my architecture (Haswell). I would not do this on old machines, of course. I tried the direct transposition of the matrix, and it is surprisingly a bit slower than _MM_TRANSPOSE4_PS. – Asohan Jun 27 '15 at 21:25
  • It looks like you have reproduced more or less the results of my first question on SO (although I did not test AVX). If you have to transpose then hadd is the best (and the dot product is never useful). But if you don't have to transpose then SSE is best. – Z boson Jun 27 '15 at 21:25
  • @Asohan, I don't mean to do the direct transpose. I mean assume the matrix is already transposed. Just comment out the line with `_MM_TRANSPOSE4_PS` and retest. – Z boson Jun 27 '15 at 21:27
  • Yes, I fully agree. Do you see any improvement for AVX? Do you know why the speed-up is limited to a factor 2-2.5 instead of 4 for SSE3? – Asohan Jun 27 '15 at 21:27
  • @Z boson: sorry, I read too fast. I agree with your idea; it is the fastest version without `_MM_TRANSPOSE4_PS`, I get a speed-up factor of almost x2.5. Thanks! It will be useful since I use symmetric matrices. – Asohan Jun 27 '15 at 21:31
  • Well, alignment might play a role. You'd better align everything on a 16-byte boundary. __attribute__((aligned(16))) or __declspec(align(16)) is your friend – Severin Pappadeux Jun 27 '15 at 21:34
  • @SeverinPappadeux, it's already 16 byte aligned in 64-bit mode but it's probably better to explicitly require this. – Z boson Jun 27 '15 at 21:37
  • @SeverinPappadeux: I tried to give explicit alignment, but the result is unchanged; thanks for the trick though! @Z boson: It is strange that SSE is faster than SSE3, especially since SSE3 uses `HADD` instead of `ADD`, isn't it? – Asohan Jun 27 '15 at 21:45
  • No, see the answer to my first question on SO. Horizontal operations with SIMD are not efficient. They are in some sense not SIMD, because they are broken down into scalar micro-ops, so they are not parallel. That's why you should avoid them. `hadd` is only helpful if you don't have the transpose. – Z boson Jun 27 '15 at 22:13

1 Answer


Is it possible to improve the performance/speed-up further? It should be x4 (maximum) for SSE and x8 for AVX.

Yes, I explained this in detail at efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-product.

The efficient method for multiplying a 4x4 matrix M with a column vector u, giving v = M u, is:

v = u1*col1 + u2*col2 + u3*col3 + u4*col4.

This requires that you store the column vectors. For example, let's assume you have the following 4x4 matrix A:

 0  1  2  3
 4  5  6  7
 8  9 10 11
12 13 14 15  

then you store this as

0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15

Conversely, if you want the row vector uT times the matrix M, giving vT = uT*M, then you want

vT = uT1*row1 + uT2*row2 + uT3*row3 + uT4*row4.

and in this case you should pack the rows, not the columns.
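Returning to the column case: for illustration, here is a minimal packing sketch (the helper name pack_columns is mine, not from the question's code). It rearranges a row-major 4x4 matrix into the column order shown above, so that each subsequent _mm_load_ps fetches a whole column.

// Hypothetical helper: pack a row-major 4x4 matrix into column-major
// storage, i.e. the "0 4 8 12 1 5 9 13 ..." layout described above.
void pack_columns( float4 out_cols[4], const float4 in_rows[4] )
{
    const float* m = (const float*)in_rows;
    float*       c = (float*)out_cols;
    for( int col = 0 ; col < 4 ; col++ )
    for( int row = 0 ; row < 4 ; row++ )
        c[4*col + row] = m[4*row + col];
}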

So, to optimize the code in your function A_times_x_SSE, comment out the line

 _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

and this function will be faster than your other functions using horizontal operations.
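To make this concrete, here is a sketch of the resulting column-packed version (the function name is mine; it assumes in_A now holds the columns of A, e.g. produced by a packing step like pack_columns above, and is otherwise just A_times_x_SSE with the transpose removed):

void A_times_x_SSE_cols( float4& out_y, const float4* in_A, const float4& in_x )
{
    // in_A is assumed to hold the COLUMNS of A, packed as described above.
    __m128 x  = _mm_load_ps((const float*)&in_x);
    __m128 c0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 c1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 c2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 c3 = _mm_load_ps((const float*)(in_A + 3));

    // Broadcast each component of x into its own register.
    __m128 u0 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,0,0,0));
    __m128 u1 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,1,1,1));
    __m128 u2 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,2,2,2));
    __m128 u3 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(3,3,3,3));

    // v = u1*col1 + u2*col2 + u3*col3 + u4*col4: only vertical operations.
    __m128 result = _mm_add_ps(_mm_add_ps(_mm_mul_ps(u0, c0), _mm_mul_ps(u1, c1)),
                               _mm_add_ps(_mm_mul_ps(u2, c2), _mm_mul_ps(u3, c3)));

    _mm_store_ps((float*)&out_y, result);
}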

Horizontal operations with SIMD are not efficient. They are in some sense not SIMD, because they are broken down into scalar micro-ops, so they are not parallel. They are only useful when it's inconvenient to pack your data in a form friendly for SIMD, for example when you can't store the columns of M and only have the rows.

Why is AVX not faster than SSE3?

In order to do this efficiently with AVX you have to operate on two 4x4 matrices at once, and also pack your matrices so that they are friendly for AVX. Now let's assume that, in addition to the matrix A defined above, you have another matrix B:

16 17 18 19
20 21 22 23
24 25 26 27
28 29 30 31

The optimal way to pack A and B for AVX is

col1A col1B col2A col2B col3A col3B col4A col4B
0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31

Let's assume you have two vectors u = {0,1,2,3} and v = {4,5,6,7} and you want y = Au and z = Bv; then with AVX you do:

c1 = col1A col1B = {0  4  8 12 16 20 24 28} = _mm256_load_ps
c2 = col2A col2B = {1  5  9 13 17 21 25 29}
c3 = col3A col3B = {2  6 10 14 18 22 26 30}
c4 = col4A col4B = {3  7 11 15 19 23 27 31}
broad1 = {0,0,0,0,4,4,4,4}
broad2 = {1,1,1,1,5,5,5,5}
broad3 = {2,2,2,2,6,6,6,6}
broad4 = {3,3,3,3,7,7,7,7}
w = broad1*c1 + broad2*c2 + broad3*c3 + broad4*c4;
//w = {y1, y2, y3, y4, z1, z2, z3, z4};

So the resulting 8-wide vector w contains the 4-vectors y and z. This is the most efficient method with AVX. If you have fixed matrices and variable vectors that you loop over, then the cost of packing at runtime before the loop will be negligible for a large loop. If you know the matrices are fixed at compile time then you can pack at compile time.
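A minimal sketch of this AVX scheme, under the stated assumptions (the function name and the packed in_AB argument are hypothetical; in_AB must hold the 32 floats in the interleaved column order above and be 32-byte aligned):

// Computes y = A*u and z = B*v in one pass over two packed matrices.
void two_mat_vec_AVX( float4& out_y, float4& out_z,
                      const float* in_AB,  // 32 floats: col1A col1B col2A col2B ...
                      const float4& in_u, const float4& in_v )
{
    // uv = {u1,u2,u3,u4, v1,v2,v3,v4}
    __m256 uv = _mm256_insertf128_ps(
                    _mm256_castps128_ps256(_mm_load_ps((const float*)&in_u)),
                    _mm_load_ps((const float*)&in_v), 1);

    __m256 c1 = _mm256_load_ps(in_AB +  0); // col1A | col1B
    __m256 c2 = _mm256_load_ps(in_AB +  8); // col2A | col2B
    __m256 c3 = _mm256_load_ps(in_AB + 16); // col3A | col3B
    __m256 c4 = _mm256_load_ps(in_AB + 24); // col4A | col4B

    // In-lane broadcasts: broad1 = {u1,u1,u1,u1, v1,v1,v1,v1}, etc.
    __m256 b1 = _mm256_shuffle_ps(uv,uv, _MM_SHUFFLE(0,0,0,0));
    __m256 b2 = _mm256_shuffle_ps(uv,uv, _MM_SHUFFLE(1,1,1,1));
    __m256 b3 = _mm256_shuffle_ps(uv,uv, _MM_SHUFFLE(2,2,2,2));
    __m256 b4 = _mm256_shuffle_ps(uv,uv, _MM_SHUFFLE(3,3,3,3));

    // w = {y1,y2,y3,y4, z1,z2,z3,z4}: only vertical multiplies and adds.
    __m256 w = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(b1, c1), _mm256_mul_ps(b2, c2)),
                             _mm256_add_ps(_mm256_mul_ps(b3, c3), _mm256_mul_ps(b4, c4)));

    _mm_store_ps((float*)&out_y, _mm256_castps256_ps128(w));
    _mm_store_ps((float*)&out_z, _mm256_extractf128_ps(w, 1));
}

Setting v = u then gives y = A*u and z = B*u, which covers the single-vector case discussed in the comments below.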

Z boson
  • Thanks a lot for your reply! I see the point now. Indeed, we should avoid using `HADD` (and prefer `ADD`), and avoid doing the transpose. In my case (symmetric matrices), I get a speed-up factor of x2.45 using SSE1, and x2.55 using AVX (using `ADD`). What you are suggesting for AVX is the best solution, but it requires two 4-vectors directly as input, which I do not have unfortunately. Anyway it solves my problems and answers my questions, thanks! – Asohan Jun 28 '15 at 11:52
  • @Asohan, the AVX method does not require two 4-vectors. Just set `v = u`. In fact you could make a function `matvec(float4 *y, float* packed_matricies, float4 u)` which takes a single float4 vector as input and an array of packed matrices and returns a float4 array `y*` of matrix*vector results. The only requirement is that the number of matrices is even. But if you have an odd number then when you pack just pad one more empty matrix (or any 4x4 matrix) and ignore the last 4-vector in the array `y*`. – Z boson Jun 28 '15 at 12:34
  • @Z boson: Indeed, that is what I did in my example: I use `v=u` for the `x` vector, and then I use only two `ADD` calls instead of 4. In the end I extract the two `__m128` from a `__m256` and I get the final `float4`. I think it is the best we can do if we have only one matrix, without doing empty calculations. Now I am working on the same routines for double-precision calculations using `__m256d`; I hope to achieve the same performance as `__m128` with SSE1. – Asohan Jun 28 '15 at 12:46