I wrote a little function, just for example, of sum two arrays of float numbers using cted to SSE SIMD vector oerations in C++, but i didn't expected to see the same result in Go lang assembly? What is wrong?
Common pseudocode looks like:
a := [N]float{}
b := [N]float{}
sum := 0
for i < N do
sum += a[i] + b[i]
endfor
vectorize SSE SIMD C++:
#include <iostream>
#include <intrin.h>
using namespace std;
float sum(const float* a, const float* b, unsigned int len) {
__m128 sum = _mm_set1_ps(0);
for (int i = 0; i < len; i += 4) {
__m128 sseA = _mm_loadu_ps(&a[i]);
__m128 sseB = _mm_loadu_ps(&b[i]);
sum = _mm_add_ps(sseA, sum);
sum = _mm_add_ps(sseB, sum);
}
sum = _mm_hadd_ps(sum, sum);
sum = _mm_hadd_ps(sum, sum);
return _mm_cvtss_f32(sum);
}
int main() {
size_t N = 7
float *a = (float *)malloc(sizeof(float)*N);
a[0] = 1;
a[1] = 2;
a[2] = 3.21;
a[3] = 3.5;
a[4] = 3.1;
a[5] = 3.2;
a[6] = 0.2;
float *b = (float *)malloc(sizeof(float)*N);
b[0] = 1;
b[1] = 2;
b[2] = 3.2;
b[3] = 5.5;
b[4] = 3.1;
b[5] = 3.1;
b[6] = 0.21;
float ret = sum(a, b, N);
cout<<ret;
return 0;
}
This code output: 33.32
In Go lang i wrote the same function but i couldn't see the same result:
Go assembly file:
#include "textflag.h"
//func _sum(a []float32, b[]float32) float32
TEXT _sum(SB), NOSPLIT, $0-52
MOVQ a_base+0(FP), AX
MOVQ b_base+24(FP), CX
MOVQ a_len+8(FP), DX
VXORPS X0, X0, X0
loop:
CMPQ DX, $0
JE back
VMOVUPS (AX), X1
VADDPS (CX), X1, X1
VHADDPS X1, X1, X1
VHADDPS X1, X1, X1
VADDPS X1, X0, X0
ADDQ $10, AX
ADDQ $10, CX
SUBQ $4, DX
JMP loop
back:
MOVSS X0, ret+48(FP)
RET
Go main file:
package main
import "fmt"
//go:noescape
func _sum(a []float32, b []float32) float32
func main() {
a := []float32{1, 2, 3.21, 3.5, 3.1, 3.2, 0.2}
b := []float32{1, 1, 3.2, 5.5, 3.1, 3.1, 0.21}
ret := _sum(a, b)
fmt.Println(ret)
}
This code output: 1.1076422e+09