I wrote some code to play around with comparing regular C++ code against SSE intrinsics. What I noticed is that the two versions shown below run in similar times, usually within 5-10% of each other. Naïvely, I expected something more noticeable.
The code is posted below, followed by some interesting sections of the disassembly, which only added to my confusion.
// main.cpp
#include <chrono>
#include <vector>
#include <complex>
#include <iostream>
#include <cstdlib>      // drand48
#include <immintrin.h>
using cdouble_sse = __m128d;
using cdouble = std::complex<double>;
cdouble_sse prod_sse(cdouble_sse val, cdouble_sse other);
cdouble prod(cdouble a, cdouble b);
int main() {
    int constexpr N = 10000;
    int constexpr N2 = 1000000;
    cdouble *v1 = (cdouble*)_mm_malloc(N*sizeof(cdouble), alignof(cdouble_sse));
    cdouble *v2 = (cdouble*)_mm_malloc(N*sizeof(cdouble), alignof(cdouble_sse));
    for (int i = 0; i < N; ++i) {
        v1[i] = cdouble{drand48(), drand48()};
        v2[i] = cdouble{drand48(), drand48()};
    }
    {
        double const div = 1.0 / N;
        cdouble out = 0.0;
        auto start = std::chrono::steady_clock::now();
        for (int t = 0; t < N2; ++t) {
            cdouble out_tmp = 0.0;
            cdouble *p1 = v1;
            cdouble *p2 = v2;
            cdouble *end = p1 + N;
            for (; p1 != end; ++p1, ++p2)
                out_tmp += prod(*p1, *p2);
            out += out_tmp * div;
        }
        auto end = std::chrono::steady_clock::now();
        std::cout << out.real() << '\t' << out.imag() << '\n';
        std::chrono::duration<double> elapsed = end - start;
        std::cout << elapsed.count();
        std::cout << '\n';
    }
    {
        cdouble_sse div{1.0 / N, 1.0 / N};
        cdouble_sse out{0.0, 0.0};
        auto start = std::chrono::steady_clock::now();
        for (int t = 0; t < N2; ++t) {
            cdouble_sse out_tmp{0.0, 0.0};
            cdouble_sse *p1 = reinterpret_cast<cdouble_sse*>(v1);
            cdouble_sse *p2 = reinterpret_cast<cdouble_sse*>(v2);
            cdouble_sse *end = p1 + N;
            for (; p1 != end; ++p1, ++p2) {
                cdouble_sse tmp = prod_sse(*p1, *p2);
                out_tmp = _mm_add_pd(out_tmp, tmp);
            }
            out = _mm_add_pd(out, _mm_mul_pd(out_tmp, div));
        }
        auto end = std::chrono::steady_clock::now();
        cdouble res = cdouble{out[0], out[1]};
        std::cout << res.real() << '\t' << res.imag() << '\n';
        std::chrono::duration<double> elapsed = end - start;
        std::cout << elapsed.count();
        std::cout << '\n';
    }
    _mm_free(v1);
    _mm_free(v2);
    return 0;
}
// prods.cpp
#include <complex>
#include <immintrin.h>
using cdouble_sse = __m128d;
using cdouble = std::complex<double>;

cdouble_sse prod_sse(cdouble_sse val, cdouble_sse other) {
    auto const x1 = _mm_shuffle_pd(val, val, 0);      // {val.re, val.re}
    auto const y1 = _mm_shuffle_pd(val, val, 3);      // {val.im, val.im}
    auto const y2 = _mm_shuffle_pd(other, other, 1);  // {other.im, other.re}
    auto const z1 = _mm_mul_pd(y1, y2);
    // lane 0: add, lane 1: subtract -> conj(val) * other
    return cdouble_sse{_mm_fmsubadd_pd(x1, other, z1)};
}

cdouble prod(cdouble a, cdouble b) { return std::conj(a) * b; }
# Makefile
ARGS=--std=c++20 -march=native -O2

all: main

prods.o: prods.cpp Makefile
	g++ $(ARGS) prods.cpp -c

main: main.cpp prods.o
	g++ $(ARGS) main.cpp -c
	g++ $(ARGS) main.o prods.o -o main
	objdump -Cd main > dump_main
Here are the sections of the disassembled code that call the prod and prod_sse functions:
11c0: c4 c1 7b 10 07 vmovsd (%r15),%xmm0
11c5: c4 c1 7b 10 4f 08 vmovsd 0x8(%r15),%xmm1
11cb: c5 fb 10 13 vmovsd (%rbx),%xmm2
11cf: c5 fb 10 5b 08 vmovsd 0x8(%rbx),%xmm3
11d4: 48 83 c3 10 add $0x10,%rbx
11d8: e8 63 03 00 00 call 1540 <prod(std::complex<double>, std::complex<double>)>
11dd: c4 e1 f9 6e ed vmovq %rbp,%xmm5
11e2: c5 d3 58 e0 vaddsd %xmm0,%xmm5,%xmm4
11e6: c5 f3 58 34 24 vaddsd (%rsp),%xmm1,%xmm6
11eb: 49 83 c7 10 add $0x10,%r15
11ef: c4 e1 f9 7e e5 vmovq %xmm4,%rbp
11f4: c5 fb 11 34 24 vmovsd %xmm6,(%rsp)
11f9: 4c 39 f3 cmp %r14,%rbx
11fc: 75 c2 jne 11c0 <main+0xf0>
...
12d8: c4 c1 79 28 0c 1c vmovapd (%r12,%rbx,1),%xmm1
12de: c4 c1 79 28 44 1d 00 vmovapd 0x0(%r13,%rbx,1),%xmm0
12e5: 48 83 c3 10 add $0x10,%rbx
12e9: e8 32 02 00 00 call 1520 <prod_sse(double __vector(2), double __vector(2))>
12ee: c5 f9 58 14 24 vaddpd (%rsp),%xmm0,%xmm2
12f3: c5 f9 29 14 24 vmovapd %xmm2,(%rsp)
12f8: 48 81 fb 00 71 02 00 cmp $0x27100,%rbx
12ff: 75 d7 jne 12d8 <main+0x208>
The non-SSE version uses roughly twice as many mov's as the SSE version (four vmovsd loads per iteration versus two vmovapd loads), which makes sense.
You can also see below the disassembled code for both product functions:
0000000000001520 <prod_sse(double __vector(2), double __vector(2))>:
1520: c4 e3 79 05 d0 00 vpermilpd $0x0,%xmm0,%xmm2
1526: c4 e3 79 05 d9 01 vpermilpd $0x1,%xmm1,%xmm3
152c: c4 e3 79 05 c0 03 vpermilpd $0x3,%xmm0,%xmm0
1532: c5 f9 59 c3 vmulpd %xmm3,%xmm0,%xmm0
1536: c4 e2 e9 b7 c1 vfmsubadd231pd %xmm1,%xmm2,%xmm0
153b: c3 ret
153c: 0f 1f 40 00 nopl 0x0(%rax)
0000000000001540 <prod(std::complex<double>, std::complex<double>)>:
1540: c5 e3 10 eb vmovsd %xmm3,%xmm3,%xmm5
1544: c5 f1 57 1d d4 0a 00 vxorpd 0xad4(%rip),%xmm1,%xmm3 # 2020 <_IO_stdin_used+0x20>
154b: 00
154c: c5 eb 10 e2 vmovsd %xmm2,%xmm2,%xmm4
1550: c5 fb 10 f0 vmovsd %xmm0,%xmm0,%xmm6
1554: c5 fb 59 d5 vmulsd %xmm5,%xmm0,%xmm2
1558: c5 e3 59 c5 vmulsd %xmm5,%xmm3,%xmm0
155c: c4 e2 e9 9d cc vfnmadd132sd %xmm4,%xmm2,%xmm1
1561: c4 e2 c9 bb c4 vfmsub231sd %xmm4,%xmm6,%xmm0
1566: c5 f9 2e c1 vucomisd %xmm1,%xmm0
156a: 7a 01 jp 156d <prod(std::complex<double>, std::complex<double>)+0x2d>
156c: c3 ret
156d: 50 push %rax
156e: c5 d3 10 cd vmovsd %xmm5,%xmm5,%xmm1
1572: c5 db 10 c4 vmovsd %xmm4,%xmm4,%xmm0
1576: c5 cb 10 d6 vmovsd %xmm6,%xmm6,%xmm2
157a: e8 b1 fa ff ff call 1030 <__muldc3@plt>
157f: 5a pop %rdx
1580: c3 ret
The code for prod performs more operations than that of prod_sse, since it deals with the real and imaginary parts separately.
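For reference, here is my own scalar expansion of conj(a) * b (not part of the project, just to make explicit what both functions compute); the vfmsubadd231pd in prod_sse produces the lane-0 add (real part) and the lane-1 subtract (imaginary part) of this expression in a single instruction:
// Hypothetical scalar version of the same product, for illustration only.
cdouble prod_scalar(cdouble a, cdouble b) {
    double const re = a.real() * b.real() + a.imag() * b.imag();
    double const im = a.real() * b.imag() - a.imag() * b.real();
    return cdouble{re, im};
}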
Can anyone explain to me why the SSE version is only 5-10% faster than the "regular" version?
Edit: even if I compile with the -flto flag, the time difference between the SSE and non-SSE code is still around 5-10%.
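For reference, the LTO build amounts to something like adding -flto to the ARGS line of the Makefile above, so that it is passed to both the compile and the link commands:
# Makefile (LTO build)
ARGS=--std=c++20 -march=native -O2 -flto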