I am not an assembly expert, but I have managed to produce the following. I would have posted this as a comment, but it is too big:
cat test.s
# GCC 6.2 output (x86-64, AT&T syntax) for f() in test.c.
# The loop is scalar SSE: it keeps a running complex product in two registers
# and multiplies in one 8-byte element (two floats) per iteration.
#   In:   %rdi = pointer to the first element
#         %esi = element count, tested as a signed 32-bit value
#   Out:  low 64 bits of %xmm0 = the two floats of the product, packed
#   Regs: %xmm1 = first (real) half of the product
#         %xmm0 = second (imaginary) half of the product
#         %rax  = one-past-the-end pointer, %xmm2-%xmm4 = temporaries
# NOTE(review): the count is read from %esi (32 bits), while the C source shown
# further down declares `long n` - presumably this listing was generated from a
# variant with an `int` count; confirm.
.file "test.c"
.text
.p2align 4,,15
.globl f
.type f, @function
f:
.LFB0:
.cfi_startproc
# Count <= 0: skip the loop and return the initial product via .L4.
testl %esi, %esi
jle .L4
# %eax = count - 1 (the 32-bit write zero-extends into %rax).
leal -1(%rsi), %eax
# Initial product is 1.0 + 0.0i: second half = 0.0, first half = 1.0f from .LC1.
pxor %xmm0, %xmm0
movss .LC1(%rip), %xmm1
# %rax = %rdi + 8*(count-1) + 8, i.e. the address just past the last element.
leaq 8(%rdi,%rax,8), %rax
# Align the loop head: to 16 bytes if that costs at most 10 bytes of padding,
# otherwise to 8 bytes.
.p2align 4,,10
.p2align 3
.L3:
# Keep a copy of the old first half; it is needed after %xmm1 is overwritten.
movaps %xmm1, %xmm4
# %xmm3 = a (first float of the element), %xmm2 = b (second float).
movss (%rdi), %xmm3
movss 4(%rdi), %xmm2
# %xmm1 = re*a
mulss %xmm3, %xmm1
# %xmm4 = re*b
mulss %xmm2, %xmm4
# Advance to the next element.
addq $8, %rdi
# %xmm2 = im*b
mulss %xmm0, %xmm2
# Compare against the end pointer; the SSE arithmetic below leaves the
# integer flags untouched, so the jne at the bottom still sees this result.
cmpq %rdi, %rax
# %xmm0 = im*a
mulss %xmm3, %xmm0
# new re = re*a - im*b
subss %xmm2, %xmm1
# new im = im*a + re*b
addss %xmm4, %xmm0
jne .L3
.L1:
# Pack the two halves into one 64-bit value through two adjacent stack slots
# below %rsp (%rsp itself is not adjusted), then reload them together.
movss %xmm1, -8(%rsp)
movss %xmm0, -4(%rsp)
movq -8(%rsp), %xmm0
ret
.L4:
# Empty-input path: product = 1.0 + 0.0i, then share the return sequence.
movss .LC1(%rip), %xmm1
pxor %xmm0, %xmm0
jmp .L1
.cfi_endproc
.LFE0:
.size f, .-f
.section .rodata.cst4,"aM",@progbits,4
.align 4
.LC1:
# 1065353216 = 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0f.
.long 1065353216
.ident "GCC: (Ubuntu 6.2.0-5ubuntu12) 6.2.0 20161005"
.section .note.GNU-stack,"",@progbits
My compilation command was gcc -S -O3 -ffast-math -ftree-vectorizer-verbose=3 -ftree-slp-vectorize -ftree-vectorize -msse3 test.c
You do not need all of these flags, as a few of them are already enabled at -O3. Refer to https://gcc.gnu.org/projects/tree-ssa/vectorization.html
While I do not have a complete answer, I have tried to help.
When I also specify my CPU architecture (with -march), I get the following:
# GCC 6.2 output (x86-64, AT&T syntax) for f() in test.c when the target CPU
# is given as -march=haswell. Same algorithm as a plain complex-product loop,
# but using VEX-encoded three-operand scalar instructions and fused
# multiply-add/subtract. The loop is still scalar: one 8-byte element per pass.
#   In:   %rdi = pointer to the first element
#         %esi = element count, tested as a signed 32-bit value
#   Out:  low 64 bits of %xmm0 = the two floats of the product, packed
#   Regs: %xmm1 = first (real) half of the product
#         %xmm0 = second (imaginary) half of the product
#         %rax  = one-past-the-end pointer, %xmm2-%xmm4 = temporaries
.file "test.c"
.text
.p2align 4,,15
.globl f
.type f, @function
f:
.LFB0:
.cfi_startproc
# Count <= 0: skip the loop and return the initial product via .L4.
testl %esi, %esi
jle .L4
# Initial first half = 1.0f (constant at .LC1).
vmovss .LC1(%rip), %xmm1
# %eax = count - 1 (the 32-bit write zero-extends into %rax).
leal -1(%rsi), %eax
# Initial second half = 0.0.
vxorps %xmm0, %xmm0, %xmm0
# %rax = %rdi + 8*(count-1) + 8, i.e. the address just past the last element.
leaq 8(%rdi,%rax,8), %rax
# Align the loop head: to 16 bytes if that costs at most 10 bytes of padding,
# otherwise to 8 bytes.
.p2align 4,,10
.p2align 3
.L3:
# %xmm2 = a (first float of the element), %xmm3 = b (second float).
vmovss (%rdi), %xmm2
vmovss 4(%rdi), %xmm3
# Advance to the next element.
addq $8, %rdi
# %xmm4 = im*b
vmulss %xmm3, %xmm0, %xmm4
# %xmm0 = im*a
vmulss %xmm2, %xmm0, %xmm0
# new im = re*b + im*a   (%xmm0 = %xmm1*%xmm3 + %xmm0)
vfmadd231ss %xmm3, %xmm1, %xmm0
# new re = re*a - im*b   (%xmm1 = %xmm1*%xmm2 - %xmm4)
vfmsub132ss %xmm2, %xmm4, %xmm1
# Loop until the cursor reaches the end pointer.
cmpq %rdi, %rax
jne .L3
.L1:
# Pack the two halves into one 64-bit value through two adjacent stack slots
# below %rsp (%rsp itself is not adjusted), then reload them together.
vmovss %xmm1, -8(%rsp)
vmovss %xmm0, -4(%rsp)
vmovq -8(%rsp), %xmm0
ret
.L4:
# Empty-input path: product = 1.0 + 0.0i, then share the return sequence.
vmovss .LC1(%rip), %xmm1
vxorps %xmm0, %xmm0, %xmm0
jmp .L1
.cfi_endproc
.LFE0:
.size f, .-f
.section .rodata.cst4,"aM",@progbits,4
.align 4
.LC1:
# 1065353216 = 0x3F800000, the IEEE-754 single-precision bit pattern of 1.0f.
.long 1065353216
.ident "GCC: (Ubuntu 6.2.0-5ubuntu12) 6.2.0 20161005"
.section .note.GNU-stack,"",@progbits
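The command now is gcc -S -O3 -ffast-math -msse4 -march=haswell test.c
where haswell matches my i7 4770HQ CPU. Refer to the list of -march values in the GCC documentation to find the one for your CPU.
So, as you can see, AVX instructions (the VEX-encoded, v-prefixed forms, including FMA) come into the picture in the second version, although they are still scalar.
A sample benchmark for the following code: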
$time ./a.out
0.000000
real 0m0.684s
user 0m0.620s
sys 0m0.060s
#include <stdio.h>
#include <complex.h>
/*
 * Return the product of the first n elements of x.
 *
 * The running product starts at 1.0, so the function returns 1.0 when
 * n <= 0 (the loop body never runs).
 */
complex float f(complex float x[], long n ) {
complex float p = 1.0;
for (long i = 0; i < n; i++)
p *= x[i];
return p;
}
/*
 * Benchmark driver: multiplies together a 200,000,000-element complex array
 * with f() and prints the real part of the result.
 */
int main()
{
/* Static storage with an explicit initializer: only the first six elements
   are listed, the remaining ones are zero-initialized. */
static complex float x[200000000] = {0.0, 1.0, 2.0, 4.0, 5.0, 6.0};
complex float p = f(x, 200000000);
/* Printed without a trailing newline. */
printf("%f", creal(p));
return 0;
}
The array is static and has an initializer, so it is stored in the executable itself, i.e. most of it sits on disk (an SSD in my case). You can allocate it in memory at run time instead for even faster processing. This is 200M loop iterations. The binary is 1.5G, so most of the time is spent on I/O. The CPU blazes through it even without -msse3 and -march; all you need is -ffast-math, which is what makes the big difference.
I changed the program to the following:
#include <stdio.h>
#include <complex.h>
/*
 * Return the product x[0] * x[1] * ... * x[7].
 *
 * The trip count is the literal 8, so the compiler knows it at compile time;
 * the parameter n is accepted but never read.
 */
float f(float x[], long n ) {
float p = 1.0;
for (long i = 0; i < 8; i++) {
p = p * x[i];
}
return p;
}
/* Driver: prints the product of the eight values 1.0 through 8.0 computed by f(). */
int main() {
float x[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
printf("%f\n", f(x, 8));
return 0;
}
and compiled with gcc -S -O3 -ffast-math -msse3 -mfpmath=sse -mavx -march=haswell test.c
which results in:
# GCC output (x86-64, AT&T syntax) for the float version of f() with the fixed
# trip count of 8. There is no loop: all eight floats are loaded into one
# 256-bit register and reduced to a single product in three
# "shift the vector down, multiply with itself" steps (by 4, 2 and 1 floats).
#   In:   %rdi = pointer to the eight floats (%rsi, the count, is never read)
#   Out:  lowest float of %xmm0 = x[0]*x[1]*...*x[7]
#   Regs: %ymm0 = partial products, %ymm1 = zero / temporary, %ymm2 = temporary
f:
.LFB23:
.cfi_startproc
# %ymm2 = x[0..7] (unaligned load).
vmovups (%rdi), %ymm2
# %ymm1 = 0 (a VEX 128-bit write also clears the upper 128 bits).
vxorps %xmm1, %xmm1, %xmm1
# Step 1: %ymm0 = { high half of %ymm2, zeros } = x shifted down by 4 floats.
vperm2f128 $33, %ymm1, %ymm2, %ymm0
# Low four lanes now hold x[i]*x[i+4] for i = 0..3; the upper lanes are zero.
vmulps %ymm2, %ymm0, %ymm0
# Step 2: build %ymm0 shifted down by 2 floats. First %ymm2 = { high half of
# %ymm0, zeros }, then vshufps picks floats 2,3 of %ymm0 followed by floats
# 0,1 of %ymm2 in each 128-bit lane.
vperm2f128 $33, %ymm1, %ymm0, %ymm2
vshufps $78, %ymm2, %ymm0, %ymm2
# Lanes 0 and 1 now each hold the product of four of the inputs.
vmulps %ymm2, %ymm0, %ymm0
# Step 3: build %ymm0 shifted down by 1 float. vpalignr shifts the
# concatenation %ymm1:%ymm0 right by 4 bytes within each 128-bit lane.
# (The 256-bit form of vpalignr is an AVX2 instruction.)
vperm2f128 $33, %ymm1, %ymm0, %ymm1
vpalignr $4, %ymm0, %ymm1, %ymm1
# Lane 0 = product of all eight inputs; the other lanes are don't-care.
vmulps %ymm1, %ymm0, %ymm0
# Clear the upper halves of the ymm registers before returning to the caller.
vzeroupper
ret
.cfi_endproc
So what appears to me is that, to get gcc to emit packed SIMD instructions (SSE3 or, as here, AVX), you need to write the code in a certain way. http://sci.tuomastonteri.fi/programming/sse will be useful to you.
Final notes: If you experiment with different values of the upper limit for i, you will see that different instructions are produced. I think the reason for this is that gcc cannot know the value of a variable at compile time, so you might want to use C++ templates, which are capable of compile-time calculations, to do it.