I am testing out a very simple program that is using C++ expression templates to simplify writing SSE2 and AVX code that operates on arrays of values.
I have a class svec
which represents an array of values.
I have a class sreg
that represents an SSE2 double register.
I have expr
and add_expr
representing the addition of svec
arrays.
The compiler produces three extra instructions per loop for my expression template test case compared to hand rolled code. I was wondering if there is a reason for this, or any changes I can make to get he compiler to produce the same output?
The full test harness is:
#include <iostream>
#include <emmintrin.h>
struct sreg
{
__m128d reg_;
sreg() {}
sreg(const __m128d& r) :
reg_(r)
{
}
sreg operator+(const sreg& b) const
{
return _mm_add_pd(reg_, b.reg_);
}
};
template <typename T>
struct expr
{
sreg operator[](std::size_t i) const
{
return static_cast<const T&>(*this).operator[](i);
}
operator const T&() const
{
return static_cast<const T&>(*this);
}
};
template <typename A, typename B>
struct add_expr : public expr<add_expr<A, B>>
{
const A& a_;
const B& b_;
add_expr(const A& a, const B& b) :
a_{ a }, b_{ b }
{
}
sreg operator[](std::size_t i) const
{
return a_[i] + b_[i];
}
};
template <typename A, typename B>
inline auto operator+(const expr<A>& a, const expr<B>& b)
{
return add_expr<A, B>(a, b);
}
struct svec : public expr<svec>
{
sreg* regs_;
std::size_t size_;
svec(std::size_t size) :
size_{ size }
{
regs_ = static_cast<sreg*>(_aligned_malloc(size * 32, 32));
}
~svec()
{
_aligned_free(regs_);
}
template <typename T>
svec& operator=(const T& expression)
{
for (std::size_t i = 0; i < size(); i++)
{
regs_[i] = expression[i];
}
return *this;
}
const sreg& operator[](std::size_t index) const
{
return regs_[index];
}
sreg& operator[](std::size_t index)
{
return regs_[index];
}
std::size_t size() const
{
return size_;
}
};
static constexpr std::size_t size = 64;
int main()
{
svec a(size);
svec b(size);
svec c(size);
svec d(size);
svec vec(size);
//hand rolled loop
for (std::size_t j = 0; j < size; j++)
{
vec[j] = a[j] + b[j] + c[j] + d[j];
}
//expression templates version of hand rolled loop
vec = a + b + c + d;
std::cout << "Done...";
std::getchar();
return EXIT_SUCCESS;
}
For the hand rolled loop the instructions are:
00007FF621CD1B70 mov r8,qword ptr [c]
00007FF621CD1B75 mov rdx,qword ptr [b]
00007FF621CD1B7A mov rax,qword ptr [a]
00007FF621CD1B7F vmovupd xmm0,xmmword ptr [rcx+rax]
00007FF621CD1B84 vaddpd xmm1,xmm0,xmmword ptr [rdx+rcx]
00007FF621CD1B89 vaddpd xmm3,xmm1,xmmword ptr [r8+rcx]
00007FF621CD1B8F lea rax,[rcx+rbx]
00007FF621CD1B93 vaddpd xmm1,xmm3,xmmword ptr [r10+rax]
00007FF621CD1B99 vmovupd xmmword ptr [rax],xmm1
00007FF621CD1B9D add rcx,10h
00007FF621CD1BA1 cmp rcx,400h
00007FF621CD1BA8 jb main+0C0h (07FF621CD1B70h)
For the expression templates version:
00007FF621CD1BC0 mov rdx,qword ptr [c]
00007FF621CD1BC5 mov rcx,qword ptr [rcx]
00007FF621CD1BC8 mov rax,qword ptr [r8]
00007FF621CD1BCB vmovupd xmm0,xmmword ptr [r9+rax]
00007FF621CD1BD1 vaddpd xmm1,xmm0,xmmword ptr [rcx+r9]
00007FF621CD1BD7 vaddpd xmm0,xmm1,xmmword ptr [rdx+r9]
00007FF621CD1BDD lea rax,[r9+rbx]
00007FF621CD1BE1 vaddpd xmm0,xmm0,xmmword ptr [rax+r10]
00007FF621CD1BE7 vmovupd xmmword ptr [rax],xmm0
00007FF621CD1BEB add r9,10h
00007FF621CD1BEF cmp r9,400h
00007FF621CD1BF6 jae main+154h (07FF621CD1C04h) # extra instruction 1
00007FF621CD1BF8 mov rcx,qword ptr [rsp+60h] # extra instruction 2
00007FF621CD1BFD mov r8,qword ptr [rsp+58h] # extra instruction 3
00007FF621CD1C02 jmp main+110h (07FF621CD1BC0h)
Please note this is minimum verifiable code to specifically demonstrate a problem. The code was compiled using the default Release build settings in Visual Studio 2015 Update 3.
Ideas I have discounted:
the order of the loops (I have already switched the hand rolled loop and the expression templates loop to check if the compiler still inserts the extra instructions and it does)
the compiler is optimising the hand rolled loop based on the
constexpr
size
(I have already tried test code that prevents the compiler deducing thatsize
is constant to better optimise the hand rolled loop and it makes no difference to the hand rolled loop's instructions).