I am implementing array sum reduction. The result of my SSE code does not match that of the scalar code.
Am I using SSE intrinsics wrongly? Am I misunderstanding some fundamental concepts of SIMD programing?
Thanks in advance for any help!
gcc -Wall -Wextra -pedantic -std=c11 -march=native -o sum sumSSE.c
//sumSSE.c
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>
#include <immintrin.h>
//Scalar sum
float sc_sum(float *vec, uint32_t n)
{
float sum = 0.0;
for (uint32_t i = 0; i < n; i++)
sum += vec[i];
return sum;
}
//SSE sum
float li_sum(float *vec, uint32_t n)
{
float sum = 0.0;
float t[4];
uint32_t i, n4 = n>>2<<2;
__m128 v1;
__m128 v2 = _mm_setzero_ps();
for (i = 0; i < n4; i+=4) {
v1 = _mm_loadu_ps(&vec[i]);
v2 = _mm_add_ps(v2, v1);
}
for (; i < n; i++) sum += vec[i];
_mm_storeu_ps(t, v2);
sum += t[0] + t[1] + t[2] + t[3];
return sum;
}
int main(int argc, char **argv)
{
srand(time(NULL));
if (argc < 2) return -1;
uint32_t n = strtol(argv[1], NULL, 10);
float *vec = malloc(n * sizeof(float));
if (!vec) return -1;
//Fill array with random numbers in range 0-1
for (uint32_t i = 0; i < n; i++)
vec[i] = (float)rand()/RAND_MAX;
float r1 = sc_sum(vec, n);
fprintf(stderr, "R1: %.5f\n", r1);
float r2 = li_sum(vec, n);
fprintf(stderr, "R2: %.5f\n", r2);
fprintf(stdout, "r1-r2 == %.8f\n", r1-r2);
free(vec);
assert(r1 == r2);
}
Here is the absolute value of the difference between the scalar result and the SSE implementation.