I have a piece of code that runs under a heavily contended lock, so it needs to be as fast as possible. The code itself is very simple: a basic multiply-add over a block of data, which looks like this:
for (int i = 0; i < size; i++)
{
    c[i] += (double)a[i] * (double)b[i];
}
Under -O3 with SSE support enabled, the code is vectorized as I would expect. However, with AVX code generation turned on I get a 10-15% slowdown instead of a speedup, and I can't figure out why.
Here's the benchmark code:
#include <chrono>
#include <cstdio>
#include <cstdlib>
int main()
{
    int size = 1 << 20;
    float *a = new float[size];
    float *b = new float[size];
    double *c = new double[size];
    for (int i = 0; i < size; i++)
    {
        a[i] = rand();
        b[i] = rand();
        c[i] = rand();
    }
    for (int j = 0; j < 10; j++)
    {
        auto begin = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < size; i++)
        {
            c[i] += (double)a[i] * (double)b[i];
        }
        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();
        printf("%lldus\n", (long long)duration);
    }
}
Here's the generated assembly under SSE:
0x100007340 <+144>: cvtps2pd (%r13,%rbx,4), %xmm0
0x100007346 <+150>: cvtps2pd 0x8(%r13,%rbx,4), %xmm1
0x10000734c <+156>: cvtps2pd (%r15,%rbx,4), %xmm2
0x100007351 <+161>: mulpd %xmm0, %xmm2
0x100007355 <+165>: cvtps2pd 0x8(%r15,%rbx,4), %xmm0
0x10000735b <+171>: mulpd %xmm1, %xmm0
0x10000735f <+175>: movupd (%r14,%rbx,8), %xmm1
0x100007365 <+181>: addpd %xmm2, %xmm1
0x100007369 <+185>: movupd 0x10(%r14,%rbx,8), %xmm2
0x100007370 <+192>: addpd %xmm0, %xmm2
0x100007374 <+196>: movupd %xmm1, (%r14,%rbx,8)
0x10000737a <+202>: movupd %xmm2, 0x10(%r14,%rbx,8)
0x100007381 <+209>: addq $0x4, %rbx
0x100007385 <+213>: cmpq $0x100000, %rbx ; imm = 0x100000
0x10000738c <+220>: jne 0x100007340 ; <+144> at main.cpp:26:20
Results from running the SSE benchmark:
1411us
1246us
1243us
1267us
1242us
1237us
1246us
1242us
1250us
1229us
Generated assembly with AVX enabled:
0x1000070b0 <+144>: vcvtps2pd (%r13,%rbx,4), %ymm0
0x1000070b7 <+151>: vcvtps2pd 0x10(%r13,%rbx,4), %ymm1
0x1000070be <+158>: vcvtps2pd 0x20(%r13,%rbx,4), %ymm2
0x1000070c5 <+165>: vcvtps2pd 0x30(%r13,%rbx,4), %ymm3
0x1000070cc <+172>: vcvtps2pd (%r15,%rbx,4), %ymm4
0x1000070d2 <+178>: vmulpd %ymm4, %ymm0, %ymm0
0x1000070d6 <+182>: vcvtps2pd 0x10(%r15,%rbx,4), %ymm4
0x1000070dd <+189>: vmulpd %ymm4, %ymm1, %ymm1
0x1000070e1 <+193>: vcvtps2pd 0x20(%r15,%rbx,4), %ymm4
0x1000070e8 <+200>: vcvtps2pd 0x30(%r15,%rbx,4), %ymm5
0x1000070ef <+207>: vmulpd %ymm4, %ymm2, %ymm2
0x1000070f3 <+211>: vmulpd %ymm5, %ymm3, %ymm3
0x1000070f7 <+215>: vaddpd (%r14,%rbx,8), %ymm0, %ymm0
0x1000070fd <+221>: vaddpd 0x20(%r14,%rbx,8), %ymm1, %ymm1
0x100007104 <+228>: vaddpd 0x40(%r14,%rbx,8), %ymm2, %ymm2
0x10000710b <+235>: vaddpd 0x60(%r14,%rbx,8), %ymm3, %ymm3
0x100007112 <+242>: vmovupd %ymm0, (%r14,%rbx,8)
0x100007118 <+248>: vmovupd %ymm1, 0x20(%r14,%rbx,8)
0x10000711f <+255>: vmovupd %ymm2, 0x40(%r14,%rbx,8)
0x100007126 <+262>: vmovupd %ymm3, 0x60(%r14,%rbx,8)
0x10000712d <+269>: addq $0x10, %rbx
0x100007131 <+273>: cmpq $0x100000, %rbx ; imm = 0x100000
0x100007138 <+280>: jne 0x1000070b0 ; <+144> at main.cpp:26:20
Results from running the AVX benchmark:
1532us
1404us
1480us
1464us
1410us
1383us
1333us
1362us
1494us
1526us
Note that the AVX code being unrolled to twice as many instructions as the SSE code doesn't seem to be the issue - I've tried a smaller unroll by hand (to match the SSE version) and AVX was still slower; a rough sketch of that variant is below.
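For reference, the hand-unrolled variant looked roughly like this (a minimal sketch using AVX intrinsics, processing one ymm register, i.e. 4 doubles, per iteration; the exact unroll factor and code I benchmarked may have differed slightly, and it assumes size is a multiple of 4):
#include <immintrin.h>
// Multiply-add using a single 256-bit register per iteration (4 doubles),
// i.e. the same element count per iteration as the SSE loop above.
void madd_avx(const float *a, const float *b, double *c, int size)
{
    for (int i = 0; i < size; i += 4)
    {
        __m256d va = _mm256_cvtps_pd(_mm_loadu_ps(a + i)); // 4 floats -> 4 doubles
        __m256d vb = _mm256_cvtps_pd(_mm_loadu_ps(b + i));
        __m256d vc = _mm256_loadu_pd(c + i);
        vc = _mm256_add_pd(vc, _mm256_mul_pd(va, vb));     // c[i] += a[i] * b[i]
        _mm256_storeu_pd(c + i, vc);
    }
}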
For context, I'm using macOS 11 and Xcode 12, on a Mac Pro 6.1 (trashcan) with an Intel Xeon CPU E5-1650 v2 @ 3.50GHz.