I am trying to speed up the calculation of the average of 4D vectors stored in an array. Here is my code:
#include <sys/time.h>
#include <sys/param.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <xmmintrin.h>
typedef float dot[4];
#define N 1000000
/* Return the current wall-clock time as seconds (with microsecond
   resolution) in a double, for simple interval timing. */
double gettime ()
{
    struct timeval now;

    gettimeofday (&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec * 0.000001;
}
/* Naive scalar version: average n 4-D vectors into res, one
   component at a time. res must not alias array. */
void calc_avg1 (dot res, const dot array[], int n)
{
    int vec, comp;

    /* Zero the accumulator (equivalent to the all-bits-zero memset:
       0.0f is represented as zero bits on this target). */
    for (comp = 0; comp < 4; comp++)
        res[comp] = 0.0f;

    /* Accumulate each component separately. */
    for (vec = 0; vec < n; vec++)
        for (comp = 0; comp < 4; comp++)
            res[comp] += array[vec][comp];

    /* NOTE(review): n == 0 divides by zero here (0/0 -> NaN), exactly
       as in the original. */
    for (comp = 0; comp < 4; comp++)
        res[comp] /= n;
}
/* SSE version: average n 4-D vectors into res using one packed
   accumulator (all four components summed in parallel).
   Fix: the original used GNU C vector-operator extensions (+= and /=
   on __v4sf), which only compile on GCC/Clang; the __m128 type with
   _mm_add_ps/_mm_div_ps is the portable intrinsics API and generates
   the same addps/divps instructions. */
void calc_avg2 (dot res, const dot array[], int n)
{
    __m128 sum = _mm_setzero_ps ();
    int i;

    /* _mm_load_ps requires 16-byte alignment; sizeof(dot) == 16 and
       main() aligns the array base, so every element qualifies. */
    for (i = 0; i < n; i++)
        sum = _mm_add_ps (sum, _mm_load_ps (array[i]));

    /* NOTE(review): n == 0 divides by zero (NaNs), as the original did. */
    sum = _mm_div_ps (sum, _mm_set1_ps ((float) n));
    _mm_store_ps (res, sum);
}
int main ()
{
    /* Over-allocate by 15 bytes so the base address can be rounded up
       to a 16-byte boundary, as required by _mm_load_ps in calc_avg2. */
    void *space = malloc (N * sizeof (dot) + 15);
    dot *array;
    dot avg __attribute__((aligned(16)));
    int i;
    double time;

    /* Fix: the original never checked the allocation. */
    if (space == NULL)
    {
        fprintf (stderr, "malloc failed\n");
        return 1;
    }
    array = (dot*)(((unsigned long)space + 15) & ~(unsigned long)15);

    for (i = 0; i < N; i++)
    {
        array[i][0] = 1.0 * random ();
        array[i][1] = 1.0 * random ();
        array[i][2] = 1.0 * random ();
        /* Fix: component 3 was left uninitialized, yet both calc_avg
           functions read all four components — give it a defined value. */
        array[i][3] = 0.0;
    }

    time = gettime ();
    calc_avg1 (avg, array, N);
    time = gettime () - time;
    printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);

    time = gettime ();
    calc_avg2 (avg, array, N);
    time = gettime () - time;
    printf ("%f\n%f %f %f\n", time, avg[0], avg[1], avg[2]);

    /* Fix: the original leaked the buffer. */
    free (space);
    return 0;
}
So, as you can see, calc_avg1
uses naive scalar loops over the four components, while calc_avg2
replaces them with SSE instructions. I compile this code with clang 3.4:
cc -O2 -o test test.c
Here is disassemble of calc_avgX functions:
0000000000400860 <calc_avg1>:
400860: 55 push %rbp
400861: 48 89 e5 mov %rsp,%rbp
400864: 85 d2 test %edx,%edx
400866: 0f 57 c0 xorps %xmm0,%xmm0
400869: 0f 11 07 movups %xmm0,(%rdi)
40086c: 7e 42 jle 4008b0 <calc_avg1+0x50>
40086e: 48 83 c6 0c add $0xc,%rsi
400872: 0f 57 c0 xorps %xmm0,%xmm0
400875: 89 d0 mov %edx,%eax
400877: 0f 57 c9 xorps %xmm1,%xmm1
40087a: 0f 57 d2 xorps %xmm2,%xmm2
40087d: 0f 57 db xorps %xmm3,%xmm3
400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3
400885: f3 0f 11 1f movss %xmm3,(%rdi)
400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2
40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1
400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
40089d: f3 0f 58 06 addss (%rsi),%xmm0
4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008a6: 48 83 c6 10 add $0x10,%rsi
4008aa: ff c8 dec %eax
4008ac: 75 d2 jne 400880 <calc_avg1+0x20>
4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c>
4008b0: 0f 57 c0 xorps %xmm0,%xmm0
4008b3: 0f 57 c9 xorps %xmm1,%xmm1
4008b6: 0f 57 d2 xorps %xmm2,%xmm2
4008b9: 0f 57 db xorps %xmm3,%xmm3
4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4
4008c0: f3 0f 5e dc divss %xmm4,%xmm3
4008c4: f3 0f 11 1f movss %xmm3,(%rdi)
4008c8: f3 0f 5e d4 divss %xmm4,%xmm2
4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
4008d1: f3 0f 5e cc divss %xmm4,%xmm1
4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
4008da: f3 0f 5e c4 divss %xmm4,%xmm0
4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008e3: 5d pop %rbp
4008e4: c3 retq
4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1)
4008ec: 00 00 00 00
00000000004008f0 <calc_avg2>:
4008f0: 55 push %rbp
4008f1: 48 89 e5 mov %rsp,%rbp
4008f4: 85 d2 test %edx,%edx
4008f6: 0f 57 c0 xorps %xmm0,%xmm0
4008f9: 7e 10 jle 40090b <calc_avg2+0x1b>
4008fb: 89 d0 mov %edx,%eax
4008fd: 0f 1f 00 nopl (%rax)
400900: 0f 58 06 addps (%rsi),%xmm0
400903: 48 83 c6 10 add $0x10,%rsi
400907: ff c8 dec %eax
400909: 75 f5 jne 400900 <calc_avg2+0x10>
40090b: 66 0f 6e ca movd %edx,%xmm1
40090f: 66 0f 70 c9 00 pshufd $0x0,%xmm1,%xmm1
400914: 0f 5b c9 cvtdq2ps %xmm1,%xmm1
400917: 0f 5e c1 divps %xmm1,%xmm0
40091a: 0f 29 07 movaps %xmm0,(%rdi)
40091d: 5d pop %rbp
40091e: c3 retq
40091f: 90 nop
And here is the result:
> ./test
0.004287
1073864320.000000 1074018048.000000 1073044224.000000
0.003661
1073864320.000000 1074018048.000000 1073044224.000000
So SSE version is 1.17 times faster. But when I try to do seemingly the same job, which is to calculate average of single precision scalars in an array (as described here SSE reduction of float vector for example), SSE version runs 3.32 times faster. Here is the code of calc_avgX functions:
/* Scalar baseline: sum all n floats in a single float accumulator
   and divide by n. NOTE(review): n == 0 yields 0/0 (NaN). */
float calc_avg1 (const float array[], int n)
{
    float total = 0.0f;
    int idx = 0;

    while (idx < n)
    {
        total += array[idx];
        idx++;
    }
    return total / n;
}
/* SSE version of the scalar average: sum four floats per iteration,
   then reduce the vector horizontally and divide by n.
   Fix: the original called _mm_hadd_ps, an SSE3 intrinsic declared in
   <pmmintrin.h> — but this file only includes <xmmintrin.h>, and
   baseline x86-64 guarantees only SSE2, so that version may not even
   compile without -msse3. It also subscripted a __v4sf (r[0]), a GNU
   extension. This version uses only SSE1 intrinsics. */
float calc_avg3 (const float array[], int n)
{
    __m128 acc = _mm_setzero_ps ();
    int i;

    /* Assumes n is a multiple of 4 and array is 16-byte aligned
       (required by _mm_load_ps) — same preconditions as the original. */
    for (i = 0; i < n; i += 4)
        acc = _mm_add_ps (acc, _mm_load_ps (&array[i]));

    /* SSE1-only horizontal sum of acc = {a, b, c, d}. */
    __m128 shuf = _mm_shuffle_ps (acc, acc, _MM_SHUFFLE (2, 3, 0, 1));
    __m128 sums = _mm_add_ps (acc, shuf);   /* {a+b, a+b, c+d, c+d} */
    shuf = _mm_movehl_ps (shuf, sums);      /* lane 0 = c+d         */
    sums = _mm_add_ss (sums, shuf);         /* lane 0 = a+b+c+d     */

    return _mm_cvtss_f32 (sums) / n;
}
So my question is this: why do I benefit from SSE so much in the last example (calculating the average of single-precision scalars) but not in the first (calculating the average of 4D vectors)? To me, these jobs look almost the same. What is the right way to speed up the calculations in the first example, if I am doing it wrong?
UPD: If you think it's relevant, I also supply disassembly of the second example, where average of scalars is calculated (also compiled with clang3.4 -O2).
0000000000400860 <calc_avg1>:
400860: 55 push %rbp
400861: 48 89 e5 mov %rsp,%rbp
400864: 85 d2 test %edx,%edx
400866: 0f 57 c0 xorps %xmm0,%xmm0
400869: 0f 11 07 movups %xmm0,(%rdi)
40086c: 7e 42 jle 4008b0 <calc_avg1+0x50>
40086e: 48 83 c6 0c add $0xc,%rsi
400872: 0f 57 c0 xorps %xmm0,%xmm0
400875: 89 d0 mov %edx,%eax
400877: 0f 57 c9 xorps %xmm1,%xmm1
40087a: 0f 57 d2 xorps %xmm2,%xmm2
40087d: 0f 57 db xorps %xmm3,%xmm3
400880: f3 0f 58 5e f4 addss -0xc(%rsi),%xmm3
400885: f3 0f 11 1f movss %xmm3,(%rdi)
400889: f3 0f 58 56 f8 addss -0x8(%rsi),%xmm2
40088e: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
400893: f3 0f 58 4e fc addss -0x4(%rsi),%xmm1
400898: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
40089d: f3 0f 58 06 addss (%rsi),%xmm0
4008a1: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008a6: 48 83 c6 10 add $0x10,%rsi
4008aa: ff c8 dec %eax
4008ac: 75 d2 jne 400880 <calc_avg1+0x20>
4008ae: eb 0c jmp 4008bc <calc_avg1+0x5c>
4008b0: 0f 57 c0 xorps %xmm0,%xmm0
4008b3: 0f 57 c9 xorps %xmm1,%xmm1
4008b6: 0f 57 d2 xorps %xmm2,%xmm2
4008b9: 0f 57 db xorps %xmm3,%xmm3
4008bc: f3 0f 2a e2 cvtsi2ss %edx,%xmm4
4008c0: f3 0f 5e dc divss %xmm4,%xmm3
4008c4: f3 0f 11 1f movss %xmm3,(%rdi)
4008c8: f3 0f 5e d4 divss %xmm4,%xmm2
4008cc: f3 0f 11 57 04 movss %xmm2,0x4(%rdi)
4008d1: f3 0f 5e cc divss %xmm4,%xmm1
4008d5: f3 0f 11 4f 08 movss %xmm1,0x8(%rdi)
4008da: f3 0f 5e c4 divss %xmm4,%xmm0
4008de: f3 0f 11 47 0c movss %xmm0,0xc(%rdi)
4008e3: 5d pop %rbp
4008e4: c3 retq
4008e5: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1)
4008ec: 00 00 00 00
00000000004008d0 <calc_avg3>:
4008d0: 55 push %rbp
4008d1: 48 89 e5 mov %rsp,%rbp
4008d4: 31 c0 xor %eax,%eax
4008d6: 85 f6 test %esi,%esi
4008d8: 0f 57 c0 xorps %xmm0,%xmm0
4008db: 7e 0f jle 4008ec <calc_avg3+0x1c>
4008dd: 0f 1f 00 nopl (%rax)
4008e0: 0f 58 04 87 addps (%rdi,%rax,4),%xmm0
4008e4: 48 83 c0 04 add $0x4,%rax
4008e8: 39 f0 cmp %esi,%eax
4008ea: 7c f4 jl 4008e0 <calc_avg3+0x10>
4008ec: 66 0f 70 c8 01 pshufd $0x1,%xmm0,%xmm1
4008f1: f3 0f 58 c8 addss %xmm0,%xmm1
4008f5: 66 0f 70 d0 03 pshufd $0x3,%xmm0,%xmm2
4008fa: 0f 12 c0 movhlps %xmm0,%xmm0
4008fd: f3 0f 58 c1 addss %xmm1,%xmm0
400901: f3 0f 58 c2 addss %xmm2,%xmm0
400905: 0f 57 c9 xorps %xmm1,%xmm1
400908: f3 0f 2a ce cvtsi2ss %esi,%xmm1
40090c: f3 0f 5e c1 divss %xmm1,%xmm0
400910: 5d pop %rbp
400911: c3 retq
400912: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1)
400919: 1f 84 00 00 00 00 00