1

I just learned that there's a way to achieve some parallelization using intrinsics. I found the following code and wanted to go through it but I could understand much. I was trying make the operations be in single precision but how can I do that?

#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>

inline double pi_4 (int n){
        int i;
        __m128d mypart2,x2, b, c, one;
        double *x = (double *)malloc(n*sizeof(double));
        double *mypart = (double *)malloc(n*sizeof(double));
        double sum = 0.0;
        double dx = 1.0/n;
        double x1[2] __attribute__((aligned(16)));
        one = _mm_set_pd1(1.0); // set one to (1,1)
        for (i = 0; i < n; i++){
                x[i] = dx/2 + dx*i;
        }
        for (i = 0; i < n; i+=2){
                x1[0]=x[i]; x1[1]=x[i+1];
                x2 = _mm_load_pd(x1);
                b = _mm_mul_pd(x2,x2);
                c = _mm_add_pd(b,one);
                mypart2 = _mm_div_pd(one,c); 
                _mm_store_pd(&mypart[i], mypart2);
        }
        for (i = 0; i < n; i++)
                sum += mypart[i];       
        return sum*dx;
}

int main(){
        double res;
        res=pi_4(128);
        printf("pi = %lf\n", 4*res);
        return 0;
}

I was thinking of changing everything from double to float and call the correct intrinsic functions, for instance, instead of _mm_set_pd1 -> _mm_set_ps1. I don't know if this will make the program from double to single precision.

UPDATE

I tried like follows but I'm getting a segmentation fault

#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>

inline float pi_4 (int n){
        int i;
        __m128 mypart2,x2, b, c, one;
        float *x = (float *)malloc(n*sizeof(float));
        float *mypart = (float*)malloc(n*sizeof(float));
        float sum = 0.0;
        float dx = 1.0/n;
        float x1[2] __attribute__((aligned(16)));
        one = _mm_set_ps1(1.0); // set one to (1,1)
        for (i = 0; i < n; i++){
                x[i] = dx/2 + dx*i;
        }
        for (i = 0; i < n; i+=2){
                x1[0]=x[i]; x1[1]=x[i+1];
                x2 = _mm_load_ps(x1);
                b = _mm_mul_ps(x2,x2);
                c = _mm_add_ps(b,one);
                mypart2 = _mm_div_ps(one,c); 
                _mm_store_ps(&mypart[i], mypart2);
        }
        for (i = 0; i < n; i++)
                sum += mypart[i];       
        return sum*dx;
}
int main(){
        float res;
        res=pi_4(128);
        printf("pi = %lf\n", 4*res);
        return 0;
}
BRabbit27
  • 6,333
  • 17
  • 90
  • 161
  • You need to change __m128d to __m128 but otherwise you're on the right track. Just try it and see if it works. – rhashimoto Mar 07 '13 at 00:27
  • 1
    It is called "auto-vectorization". A decent recent compiler already does this automatically, gcc does. Look at the generated machine code. – Hans Passant Mar 07 '13 at 00:27
  • possible duplicate of [How to vectorize with gcc?](http://stackoverflow.com/questions/409300/how-to-vectorize-with-gcc) – Hans Passant Mar 07 '13 at 00:28
  • @rhashimoto OK. Let me try it and see what happens. Do you have, could you explain what the idea behind this code? Do you have other sources where I can learn a little bit more of this? – BRabbit27 Mar 07 '13 at 06:43
  • @HansPassant Yes I read gcc does it... most of the times... but sometimes is necessary to make this dirty, funny tricks manually. I'll check the other thread and see how it goes. How do I see the generated machine code? gcc -S ? – BRabbit27 Mar 07 '13 at 06:45
  • @BRabbit27, it looks like this code is numerically integrating 1/(1 + x^2) from 0 to 1 by approximating the area with n=128 rectangles. The closed form integral is `arctan` which is pi/4. The program uses SSE to calculate 2 or 4 (double or single precision) rectangle contributions at a time. – rhashimoto Mar 07 '13 at 19:38

1 Answers1

3

A few more fixes are needed:

  • x1 needs to be declared with 4 elements.
  • The second for loop needs to increment by 4 (this is what caused the segfault).
  • There need to be 4 assignments to the x1 array.

These changes are all because single-precision packs 4 values into a 16-byte vector register while double-precision packs only 2 values. I think that was it:

#include <stdio.h>
#include <stdlib.h>
#include <xmmintrin.h>

inline float pi_4 (int n){
   int i;
   __m128 mypart2,x2, b, c, one;
   float *x = (float *)malloc(n*sizeof(float));
   float *mypart = (float*)malloc(n*sizeof(float));
   float sum = 0.0;
   float dx = 1.0/n;
   float x1[4] __attribute__((aligned(16)));
   one = _mm_set_ps1(1.0); // set one to (1,1,1,1)
   for (i = 0; i < n; i++){
      x[i] = dx/2 + dx*i;
   }
   for (i = 0; i < n; i+=4){
      x1[0]=x[i]; x1[1]=x[i+1];
      x1[2]=x[i+2]; x1[3]=x[i+3];
      x2 = _mm_load_ps(x1);
      b = _mm_mul_ps(x2,x2);
      c = _mm_add_ps(b,one);
      mypart2 = _mm_div_ps(one,c); 
      _mm_store_ps(&mypart[i], mypart2);
   }
   for (i = 0; i < n; i++)
      sum += mypart[i];       
   return sum*dx;
}
int main(){
   float res;
   res=pi_4(128);
   printf("pi = %lf\n", 4*res);
   return 0;
}

Drum roll...

$ ./foo
pi = 3.141597

A word on the use of malloc(). I think most implementations will return memory aligned on a 16-byte boundary as required for SSE loads and stores, but that may not be guaranteed as __m128 is not a C/C++ type (it is guaranteed to be aligned for "normal" types). It would be safer to use memalign() or posix_memalign().

rhashimoto
  • 15,650
  • 2
  • 52
  • 80