
I created the code below to test my understanding of SSE intrinsics. The code compiles and runs correctly, but the improvement from SSE is not very significant: the SSE version is only about 20% faster. Shouldn't it be roughly 4x faster, i.e. a 400% improvement in speed? Is the compiler optimizing the scalar loop? If so, how can that be disabled? Is something wrong with the sse_mult() function that I wrote?

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <emmintrin.h>
// gcc options: -mfpmath=sse -mmmx -msse -msse2 (not sure which are needed; I have been using -msse2)

/*--------------------------------------------------------------------------------------------------
 * SIMD intrinsics header files
 * 
 * <mmintrin.h>  MMX
 *
 * <xmmintrin.h> SSE
 *
 * <emmintrin.h> SSE2
 *
 * <pmmintrin.h> SSE3
 *
 * <tmmintrin.h> SSSE3
 *
 * <smmintrin.h> SSE4.1
 *
 * <nmmintrin.h> SSE4.2
 *
 * <ammintrin.h> SSE4A
 *
 * <wmmintrin.h> AES
 *
 * <immintrin.h> AVX
 *------------------------------------------------------------------------------------------------*/

#define n 1000000

// Global variables
float a[n]; // array to hold random numbers
float b[n]; // array to hold random numbers
float c[n]; // array to hold product a*b for scalar multiply
__declspec(align(16)) float d[n] ; // array to hold product a*b for sse multiply
// Also possible to use __attribute__((aligned(16))); to force correct alignment
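// For example, with GCC/Clang the attribute goes after the declarator:
//     float d[n] __attribute__((aligned(16)));
// Note: a[] and b[] are also read with _mm_load_ps below, which assumes 16-byte
// alignment; _mm_loadu_ps is the unaligned-load alternative if alignment cannot be guaranteed.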

// Multiply using loop
void loop_mult() {
    int i; // Loop index

    clock_t begin_loop, end_loop; // clock_t is type returned by clock()
    double time_spent_loop;

    // Time multiply operation
    begin_loop = clock();   
        // Multiply two arrays of floats
        for(i = 0; i < n; i++) {
            c[i] = a[i] * b[i];
        }
    end_loop = clock();

    // Calculate the time the loop took. CLOCKS_PER_SEC is the number of clock ticks per second.
    time_spent_loop = (double)(end_loop - begin_loop) / CLOCKS_PER_SEC;
    printf("Time for scalar loop was %f seconds\n", time_spent_loop);
}

// Multiply using sse
void sse_mult() {
    int k,i; // Index
    clock_t begin_sse, end_sse; // clock_t is type returned by clock()
    double time_spent_sse;

    // Time multiply operation
    begin_sse = clock();    
        // Multiply two arrays of floats
        __m128 x,y,result; // __m128 is a data type that can hold four 32-bit floating point values
        result = _mm_setzero_ps(); // set register to hold all zeros
        for(k = 0; k <= (n-4); k += 4) {
            x = _mm_load_ps(&a[k]); // Load chunk of 4 floats into register
            y = _mm_load_ps(&b[k]);
            result = _mm_mul_ps(x,y); // multiply 4 floats
            _mm_store_ps(&d[k],result); // store result in array
        }
        int extra = n%4; // If array size isn't exactly a multiple of 4 use scalar ops for remainder
        if(extra!=0) {
            for(i = (n-extra); i < n; i++) {
                d[i] = a[i] * b[i];
            }
        }
    end_sse = clock();

    // Calculate the time the loop took. CLOCKS_PER_SEC is the number of clock ticks per second.
    time_spent_sse = (double)(end_sse - begin_sse) / CLOCKS_PER_SEC;
    printf("Time for sse was %f seconds\n", time_spent_sse);
}

int main() {
    int i; // Loop index

    srand((unsigned)time(NULL)); // initial value that rand uses, called the seed
        // unsigned guarantees positive values
        // time(NULL) uses the system clock as the seed so values will be different each time

    for(i = 0; i < n; i++) {
        // Fill arrays with random numbers
        a[i] = ((float)rand()/RAND_MAX)*10; // rand() returns an integer value between 0 and RAND_MAX
        b[i] = ((float)rand()/RAND_MAX)*20;
    }

    loop_mult();
    sse_mult();
    for(i=0; i<n; i++) {
        // printf("a[%d] = %f\n", i, a[i]); // print values to check
        // printf("b[%d] = %f\n", i, b[i]);
        // printf("c[%d] = %f\n", i, c[i]);
        // printf("d[%d] = %f\n", i, d[i]);
        if(c[i]!=d[i]) {
            printf("Error with sse multiply.\n");
            break;
        }
    }


    return 0;
}
biononic
  • Set `n` to 2048 instead of 1000000. Turn on loop unrolling with `-funroll-loops`. Repeat your loop many times then divide the time by the repeat value. – Z boson Oct 21 '14 at 20:52
  • See http://stackoverflow.com/questions/25774190/l1-memory-bandwidth-50-drop-in-efficiency-using-addresses-which-differ-by-4096 for an example doing `z[i] = x[i] + y[i]` – Z boson Oct 21 '14 at 20:53
  • I don't see `-O2` or other optimization flag in the question? – Marc Glisse Oct 23 '14 at 17:48

1 Answer


Your program is memory bound. SSE doesn't make a big difference because the majority of the time is spent reading those big arrays from RAM. Reduce the size of the arrays so they fit in cache and increase the number of passes instead. When all the data is already in cache, the SSE version should perform noticeably faster.
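
Something along these lines is what I mean (a rough sketch, not your exact code; N, REPS and the fill values are placeholders I picked). The arrays stay in cache, and repeating the work REPS times makes the timing dominated by the multiplies rather than by memory bandwidth:

#include <stdio.h>
#include <time.h>
#include <emmintrin.h>

#define N 2048       // small enough that all three arrays fit in L1 cache
#define REPS 100000  // repeat the work so the total time is measurable

__attribute__((aligned(16))) float a[N];
__attribute__((aligned(16))) float b[N];
__attribute__((aligned(16))) float d[N];

int main() {
    int i, k, r;
    clock_t begin, end;

    for (i = 0; i < N; i++) { a[i] = (float)i; b[i] = 2.0f; }

    begin = clock();
    for (r = 0; r < REPS; r++) {
        for (k = 0; k <= N - 4; k += 4) {
            __m128 x = _mm_load_ps(&a[k]);          // load 4 floats
            __m128 y = _mm_load_ps(&b[k]);
            _mm_store_ps(&d[k], _mm_mul_ps(x, y));  // multiply and store 4 floats
        }
    }
    end = clock();

    // Print a result so the compiler cannot discard the work entirely
    printf("SSE: %f seconds for %d passes, d[1] = %f\n",
           (double)(end - begin) / CLOCKS_PER_SEC, REPS, d[1]);
    return 0;
}

With N = 2048, the three float arrays are about 8 KB each (~24 KB total), which fits in a typical 32 KB L1 data cache. The same restructuring applies to the scalar loop for comparison.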

Keep in mind there might be additional factors involved:

  • GCC can (to some extent) automatically vectorize loops. (I think it needs -O3 though; see the command lines after this list.)
  • The first method you test will be slower because the cache is not warm yet. You might want to run both methods alternately multiple times.
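
For the command lines (the file name prog.c is just a placeholder): -O3 enables GCC's auto-vectorizer, and -fno-tree-vectorize switches it back off if you want to be sure the scalar loop really stays scalar:

gcc -O3 -msse2 prog.c -o prog                      # auto-vectorization enabled at -O3
gcc -O3 -msse2 -fno-tree-vectorize prog.c -o prog  # same optimization level, vectorizer off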
Piotr Praszmo
  • One additional thing I suggest is to turn on loop unrolling with `-funroll-loops`. – Z boson Oct 21 '14 at 21:05
  • What do you mean by increase the number of passes? Would it be better to create two separate programs, one for scalar loop and one for sse? – biononic Oct 22 '14 at 17:58
  • @biononic Instead of processing big arrays, process smaller arrays but do it in a loop many times. That's just one of the ways to reduce the impact of cache misses. Two programs should help you isolate those two cases from each other, but this will only help with point #2. – Piotr Praszmo Oct 22 '14 at 18:25
  • Thanks! I am getting much closer to the expected results when setting the array size to 2048 and then looping over the function call. Improvements are more noticeable for a larger number of passes. – biononic Oct 22 '14 at 18:31