I am experimenting with SIMD optimization and wrote three very simple vector classes, with addition implemented in two different ways: one handwritten component-wise and one using `_mm_add_ps` (https://godbolt.org/z/fPAERV). Interestingly, GCC was not able (or I didn't tell it properly) to implement the addition for `vector2` using packed SSE instructions. Only after explicitly adding a fourth float to the vector (as in `vector3`) does GCC generate the addition using packed SSE instructions, even though I aligned the vector on a 16-byte boundary. Can anyone tell me why?
#include <xmmintrin.h>
// Three-component float vector stored in one 16-byte-aligned SSE register.
//
// The same 16 bytes are exposed through three views: named components
// (x, y, z), an indexable array (axes) and the raw __m128 (v). Reading one
// view after writing another relies on the compiler tolerating union type
// punning, and the anonymous struct is a compiler extension.
struct alignas(16) vector final {
    union {
        struct {
            float x, y, z;
        };
        float axes[3];
        __m128 v;
    };

    // Builds the vector from its components. The whole register is written
    // so the fourth lane, which has no named component, is a defined 0.0f
    // rather than leftover memory: packed arithmetic on `v` operates on all
    // four lanes, including that one.
    vector(float x, float y, float z) noexcept
        : v(_mm_setr_ps(x, y, z, 0.0f)) {}

    // Wraps an existing SSE register value unchanged. Left implicit on
    // purpose so a function can `return {some__m128};`.
    vector(__m128 v) noexcept : v(v) {}
};
// Sums two vectors with a single packed SSE add over the raw register view;
// all four lanes of `v` take part, not just x, y and z.
vector operator+(const vector& v0, const vector& v1) noexcept {
    const __m128 sum = _mm_add_ps(v0.v, v1.v);
    return vector(sum);
}
// Three-component float vector, 16-byte aligned and overlaid on an __m128.
//
// Only x, y and z are named; the last four bytes of the 16-byte object have
// no component of their own and are not written by the component
// constructor.
struct alignas(16) vector2 final {
    union {
        __m128 v;        // raw SSE register view of the whole object
        float axes[3];   // indexable view of the three components
        struct {         // named view (anonymous struct: compiler extension)
            float x;
            float y;
            float z;
        };
    };

    // Component-wise construction; initialises x, y and z only.
    vector2(float x, float y, float z) noexcept
        : x(x), y(y), z(z) {}

    // Wraps an existing SSE register value unchanged.
    vector2(__m128 v) noexcept
        : v(v) {}
};
// Sums two vectors one named component at a time (no intrinsics); only
// x, y and z are read from the operands.
vector2 operator+(const vector2& v0, const vector2& v1) noexcept {
    const float sum_x = v0.x + v1.x;
    const float sum_y = v0.y + v1.y;
    const float sum_z = v0.z + v1.z;
    return vector2(sum_x, sum_y, sum_z);
}
// Four-component float vector, 16-byte aligned and overlaid on an __m128.
//
// Unlike a three-float layout, every lane of the register has a named
// component (x, y, z, w), so the component constructor writes all 16 bytes.
struct alignas(16) vector3 final {
    union {
        __m128 v;        // raw SSE register view of the whole object
        float axes[4];   // indexable view of the four components
        struct {         // named view (anonymous struct: compiler extension)
            float x;
            float y;
            float z;
            float w;
        };
    };

    // Component-wise construction; initialises all four lanes.
    vector3(float x, float y, float z, float w) noexcept
        : x(x), y(y), z(z), w(w) {}

    // Wraps an existing SSE register value unchanged.
    vector3(__m128 v) noexcept
        : v(v) {}
};
// Sums two vectors one named component at a time (no intrinsics); all four
// components x, y, z and w are covered.
vector3 operator+(const vector3& v0, const vector3& v1) noexcept {
    const float sum_x = v0.x + v1.x;
    const float sum_y = v0.y + v1.y;
    const float sum_z = v0.z + v1.z;
    const float sum_w = v0.w + v1.w;
    return vector3(sum_x, sum_y, sum_z, sum_w);
}
Generated assembly using GCC 9.2 with `-std=c++17 -O3 -Wall -Wextra`:
# Compiler output for operator+(const vector&, const vector&), the _mm_add_ps version.
# Register roles assume the System V AMD64 convention (TODO confirm target):
# rdi = &v0, rsi = &v1; the 16-byte result comes back split across
# xmm0 (low 8 bytes) and xmm1 (high 8 bytes).
operator+(vector const&, vector const&):
# Load all four float lanes of v1 and add v0's lanes with one packed add.
movaps xmm1, XMMWORD PTR [rsi]
addps xmm1, XMMWORD PTR [rdi]
# xmm0 gets a copy of the full sum; its low 8 bytes hold x and y.
movdqa xmm0, xmm1
# Spill the sum to scratch space below rsp (rsp itself is never adjusted),
# then reload only bytes 8..15 of it (z and the fourth lane) into xmm1.
movaps XMMWORD PTR [rsp-24], xmm1
movq xmm1, QWORD PTR [rsp-16]
ret
# Compiler output for operator+(const vector2&, const vector2&), the handwritten
# component-wise version. Only scalar adds (addss) are emitted; no packed add.
# Register roles assume the System V AMD64 convention (TODO confirm target):
# rdi = &v0, rsi = &v1; result returned in xmm0 (low 8 bytes) and xmm1 (high 8 bytes).
# Offsets 0 / 4 / 8 correspond to the x / y / z floats of the struct.
operator+(vector2 const&, vector2 const&):
# xmm1 = v0.y + v1.y, xmm0 = v0.z + v1.z (the two chains are interleaved).
movss xmm1, DWORD PTR [rdi+4]
movss xmm0, DWORD PTR [rdi+8]
addss xmm1, DWORD PTR [rsi+4]
addss xmm0, DWORD PTR [rsi+8]
# xmm2 = v0.x + v1.x
movss xmm2, DWORD PTR [rdi]
addss xmm2, DWORD PTR [rsi]
# Assemble the result in a 16-byte temporary at [rsp-24] below rsp:
# y goes to offset 4, z to offset 8.
movss DWORD PTR [rsp-20], xmm1
movss DWORD PTR [rsp-16], xmm0
# xmm1 = bytes 8..15 of the temporary: z plus the 4 bytes at [rsp-12],
# which this function never writes.
movq xmm1, QWORD PTR [rsp-16]
# x goes to offset 0, then xmm0 = bytes 0..7 of the temporary (x and y).
movss DWORD PTR [rsp-24], xmm2
movq xmm0, QWORD PTR [rsp-24]
ret
# Compiler output for operator+(const vector3&, const vector3&), the handwritten
# four-component version. Here the four scalar additions were merged into one
# packed add (addps).
# Register roles assume the System V AMD64 convention (TODO confirm target):
# rdi = &v0, rsi = &v1; result returned in xmm0 (low 8 bytes) and xmm1 (high 8 bytes).
operator+(vector3 const&, vector3 const&):
# Load all four lanes of v0 and add v1's lanes in a single instruction.
movaps xmm0, XMMWORD PTR [rdi]
addps xmm0, XMMWORD PTR [rsi]
# Spill the sum to scratch space below rsp (rsp itself is never adjusted).
movaps XMMWORD PTR [rsp-40], xmm0
# rax = bytes 8..15 of the sum (z and w), fetched through a general-purpose register.
mov rax, QWORD PTR [rsp-32]
# xmm0 = bytes 0..7 of the sum (x and y).
movq xmm0, QWORD PTR [rsp-40]
# xmm1 = z and w, moved over from rax.
movq xmm1, rax
# Extra copy of the upper half to [rsp-16]; nothing reads it back before ret.
mov QWORD PTR [rsp-16], rax
ret