I created the below code in order to test my understanding of sse intrinsics. The code compiles and runs correctly but the improvement with sse is not very significant. Using sse intrinsics is approx. 20% faster. Should't it be roughly 4x faster or 400% improvement in speed? Is the compiler optimizing the scalar loop? If so how can this be disabled? Is something wrong with the sse_mult() function that I wrote?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <emmintrin.h>
// gcc options -mfpmath=sse -mmmx -msse -msse2 \ Not sure if any are needed have been using -msse2
/*--------------------------------------------------------------------------------------------------
* SIMD intrinsics header files
*
* <mmintrin.h> MMX
*
* <xmmintrin.h> SSE
*
* <emmintrin.h> SSE2
*
* <pmmintrin.h> SSE3
*
* <tmmintrin.h> SSE3
*
* <smmintrin.h> SSE4.1
*
* <nmmintrin.h> SSE4.2
*
* <ammintrin.h> SSE4A
*
* <wmmintrin.h> AES
*
* <immintrin.h> AVX
*------------------------------------------------------------------------------------------------*/
#define n 1000000
// Global variables
float a[n]; // array to hold random numbers
float b[n]; // array to hold random numbers
float c[n]; // array to hold product a*b for scalar multiply
__declspec(align(16)) float d[n] ; // array to hold product a*b for sse multiply
// Also possible to use __attribute__((aligned(16))); to force correct alignment
// Multiply using loop
void loop_mult() {
int i; // Loop index
clock_t begin_loop, end_loop; // clock_t is type returned by clock()
double time_spent_loop;
// Time multiply operation
begin_loop = clock();
// Multiply two arrays of doubles
for(i = 0; i < n; i++) {
c[i] = a[i] * b[i];
}
end_loop = clock();
// Calculate time it took to run loop. Type int CLOCK_PER_SEC is # of clock ticks per second.
time_spent_loop = (double)(end_loop - begin_loop) / CLOCKS_PER_SEC;
printf("Time for scalar loop was %f seconds\n", time_spent_loop);
}
// Multiply using sse
void sse_mult() {
int k,i; // Index
clock_t begin_sse, end_sse; // clock_t is type returned by clock()
double time_spent_sse;
// Time multiply operation
begin_sse = clock();
// Multiply two arrays of doubles
__m128 x,y,result; // __m128 is a data type, can hold 4 32 bit floating point values
result = _mm_setzero_ps(); // set register to hold all zeros
for(k = 0; k <= (n-4); k += 4) {
x = _mm_load_ps(&a[k]); // Load chunk of 4 floats into register
y = _mm_load_ps(&b[k]);
result = _mm_mul_ps(x,y); // multiply 4 floats
_mm_store_ps(&d[k],result); // store result in array
}
int extra = n%4; // If array size isn't exactly a multiple of 4 use scalar ops for remainder
if(extra!=0) {
for(i = (n-extra); i < n; i++) {
d[i] = a[i] * b[i];
}
}
end_sse = clock();
// Calculate time it took to run loop. Type int CLOCK_PER_SEC is # of clock ticks per second.
time_spent_sse = (double)(end_sse - begin_sse) / CLOCKS_PER_SEC;
printf("Time for sse was %f seconds\n", time_spent_sse);
}
int main() {
int i; // Loop index
srand((unsigned)time(NULL)); // initial value that rand uses, called the seed
// unsigned garauntees positive values
// time(NULL) uses the system clock as the seed so values will be different each time
for(i = 0; i < n; i++) {
// Fill arrays with random numbers
a[i] = ((float)rand()/RAND_MAX)*10; // rand() returns an integer value between 0 and RAND_MAX
b[i] = ((float)rand()/RAND_MAX)*20;
}
loop_mult();
sse_mult();
for(i=0; i<n; i++) {
// printf("a[%d] = %f\n", i, a[i]); // print values to check
// printf("b[%d] = %f\n", i, b[i]);
// printf("c[%d] = %f\n", i, c[i]);
// printf("d[%d] = %f\n", i, d[i]);
if(c[i]!=d[i]) {
printf("Error with sse multiply.\n");
break;
}
}
return 0;
}