I am new to SSE intrinsics and try to optimise my code by it. Here is my program about counting array elements which are equal to the given value.
I changed my code to SSE version but the speed almost doesn't change. I am wondering whether I use SSE in a wrong way...
This code is for an assignment where we're not allowed to enable compiler optimization options.
No SSE version:
int get_freq(const float* matrix, float value) {
int freq = 0;
for (ssize_t i = start; i < end; i++) {
if (fabsf(matrix[i] - value) <= FLT_EPSILON) {
freq++;
}
}
return freq;
}
SSE version:
#include <immintrin.h>
#include <math.h>
#include <float.h>
#define GETLOAD(n) __m128 load##n = _mm_load_ps(&matrix[i + 4 * n])
#define GETEQU(n) __m128 check##n = _mm_and_ps(_mm_cmpeq_ps(load##n, value), and_value)
#define GETCOUNT(n) count = _mm_add_ps(count, check##n)
int get_freq(const float* matrix, float givenValue, ssize_t g_elements) {
int freq = 0;
int i;
__m128 value = _mm_set1_ps(givenValue);
__m128 count = _mm_setzero_ps();
__m128 and_value = _mm_set1_ps(0x00000001);
for (i = 0; i + 15 < g_elements; i += 16) {
GETLOAD(0); GETLOAD(1); GETLOAD(2); GETLOAD(3);
GETEQU(0); GETEQU(1); GETEQU(2); GETEQU(3);
GETCOUNT(0);GETCOUNT(1);GETCOUNT(2);GETCOUNT(3);
}
__m128 shuffle_a = _mm_shuffle_ps(count, count, _MM_SHUFFLE(1, 0, 3, 2));
count = _mm_add_ps(count, shuffle_a);
__m128 shuffle_b = _mm_shuffle_ps(count, count, _MM_SHUFFLE(2, 3, 0, 1));
count = _mm_add_ps(count, shuffle_b);
freq = _mm_cvtss_si32(count);
for (; i < g_elements; i++) {
if (fabsf(matrix[i] - givenValue) <= FLT_EPSILON) {
freq++;
}
}
return freq;
}