0

I have a function inside a class that i call millions of times in my code. There are demanding loops in this function that could be parallelized. My problem is that they perform summations that are stored in non-sclar variables. here is the code.

void Forces::ForRes(vector<vector<double> > & Posicoes,double epsilon,double sigma,double rc,double L)
{    

double rij_2,rij_6,rij_12,rijx,rijy,rijz;
double t;
double sigma6 = pow(sigma,6);
double sigma12 = pow (sigma6,2);              
for ( unsigned int i = 0 ; i < Posicoes.size() - 1 ; i++ )
{          
    for (unsigned int j = i + 1 ; j < Posicoes.size() ; j++)
    {
            rijx = (Posicoes[i][0]-Posicoes[j][0]) - L*round((Posicoes[i][0]-Posicoes[j][0])/L);
            rijy = (Posicoes[i][1]-Posicoes[j][1]) - L*round((Posicoes[i][1]-Posicoes[j][1])/L);
            rijz = (Posicoes[i][2]-Posicoes[j][2]) - L*round((Posicoes[i][2]-Posicoes[j][2])/L);
            rij_2 = rijx*rijx + rijy*rijy + rijz*rijz;
            rij_6 = pow(rij_2,3);
            rij_12 = pow(rij_6,2);

        if (rij_2 <= rc*rc)
        {
              U += 4*epsilon*((sigma12)/(rij_12)- (sigma6)/(rij_6));                                
            for (int k =0 ; k <3 ; k++)
            {         
                t = ((24*epsilon)/(rij_2))*(2*(sigma12)/(rij_12)- (sigma6)/(rij_6))*((Posicoes[i][k]-Posicoes[j][k]) 
                    - L*round((Posicoes[i][k]-Posicoes[j][k])/L));

                F[i][k] += t;           
                    F[j][k] -= t;
            }
       }            
    }          
}


}

Here is an example that i did in another part of the code:

    #pragma omp parallel for default(shared) reduction(+:K) private(pi_2)
    for (int i = 0 ; i < Nparticulas;i++)
        {  

         for (int k = 0 ; k < 3 ; k++)
             {
                pi_2 += Momentos.M[i][k]*Momentos.M[i][k];
             }

             K += pi_2/2;
             pi_2 = 0;  
        }  

Thanks in advance.

Code after @phadjido suggestion:

 void Forces::ForRes(vector<vector<double> > &    Posicoes,double epsilon,double sigma,double rc,double L)
 {

  double rij_2,rij_6,rij_12,rijx,rijy,rijz;
  double t;
  double sigma6 = pow(sigma,6);
  double sigma12 = pow (sigma6,2);

   U = 0;
   unsigned int j;

   for ( unsigned int i = 0 ; i < Posicoes.size() - 1 ; i++ )
   {   
       #pragma omp parallel  private (rij_2,rij_6,rij_12,j) 
        {

          double Up = 0; 
          vector <vector <double> > Fp(Posicoes.size() , vector<double>(Posicoes[0].size(),0));    

          #pragma omp for  
          for ( j = i + 1 ; j < Posicoes.size() ; j++)
              {
                 rijx = (Posicoes[i][0]-Posicoes[j][0]) - L*round((Posicoes[i][0]-Posicoes[j][0])/L);
                 rijy = (Posicoes[i][1]-Posicoes[j][1]) - L*round((Posicoes[i][1]-Posicoes[j][1])/L);
                 rijz = (Posicoes[i][2]-Posicoes[j][2]) - L*round((Posicoes[i][2]-Posicoes[j][2])/L);
                 rij_2 = rijx*rijx + rijy*rijy + rijz*rijz;
                 rij_6 = pow(rij_2,3);
                 rij_12 = pow(rij_6,2);

      if (rij_2 <= rc*rc)
         {

          Up += 4*epsilon*((sigma12)/(rij_12)- (sigma6)/(rij_6));


        for (int k =0 ; k <3 ; k++)
            {         
                 t = ((24*epsilon)/(rij_2))*(2*(sigma12)/(rij_12)- (sigma6)/(rij_6))*((Posicoes[i][k]-Posicoes[j][k]) 
                - L*round((Posicoes[i][k]-Posicoes[j][k])/L));
                Fp[i][k] += t;

                Fp[j][k] -= t;
                }
             }

         }

      #pragma omp atomic
      U += Up;
      for(j = i + 1 ; j < Posicoes.size() ; j++)
      {    
          for ( int k = 0 ; k < 3; k++)
              {            
            #pragma omp atomic
            F[i][k] += Fp[i][j];
            #pragma omp atomic
            F[j][k]  -= Fp[j][k];
              }

       }         
     }
   }

}

  • And what is your question? – slav Jul 28 '16 at 18:41
  • it´s in the question title "Parallel summation with openMP - what to do when i can´t use the reduction clause?" how to parallelize the loops in the first code. – Helton Maciel Jul 28 '16 at 18:56
  • 2
    You can use a user-defined reduction clause as discussed in [User Defined Reduction on vector of varying size](http://stackoverflow.com/questions/29633531/user-defined-reduction-on-vector-of-varying-size). – Tim Jul 28 '16 at 20:15
  • You should consider something like Verlet neighbour lists or cell-linked lists or [a combination of both](http://www.ee.bgu.ac.il/~specmeth/EMT04/pdf/session_2/2_14_04.pdf) to decrease the computational time. – Hristo Iliev Jul 29 '16 at 07:38
  • @Tim that is a fine suggestion , but since i'm new to this subject i couldn't get the 'declare reduction' clause to work with dynamic arrays. Any hint on this? – Helton Maciel Jul 29 '16 at 15:41
  • @HristoIliev i intend to do that in the future , but even in this case , parallelization would help. – Helton Maciel Jul 29 '16 at 16:36
  • If you perform domain decomposition with cell-linked lists, each thread could work on its own domain and then only the values in the border regions need to be reduced. This will both save memory and make the reduction faster. For many atomic configurations, the Verlet list alone can save you more computational time running in serial than computing everything in parallel. – Hristo Iliev Jul 29 '16 at 22:32

1 Answers1

0

If user defined reductions are not supported by the compiler, you can simply implement the reduction operation on your own. The code below shows how this can be done for your second example. Please note that pi_2 is initialized at the beginning of the loop. In your example, pi_2 is a private variable and might not have been initialized to zero. You would need firstprivate and proper initialization of pi_2 before the parallel region.

K = 0;
#pragma omp parallel private(pi_2)
{
    double local_K = 0;  /* initialize here */

    #pragma omp for
    for (int i = 0 ; i < Nparticulas;i++)
    {
        pi_2 = 0;  /* be careful */
        for (int k = 0 ; k < 3 ; k++)
        {
            pi_2 += Momentos.M[i][k]*Momentos.M[i][k];
        }
        local_K += pi_2/2;
    }

    #pragma omp atomic
        K += local_K;
}
phadjido
  • 394
  • 3
  • 9
  • So, i tried to implement your suggestion in the first part of my code. It worked fine for the scalar variabel , but for the matrix (i use the dynamic array library ) i still can´t get the proper results. Can you say what am i doing wrong? the code is in the question. – Helton Maciel Jul 29 '16 at 15:26
  • For start, the variables rijx,rijy,rijz, must be private too. For performance reasons, parallelize the outer loop and not the inner one. – phadjido Jul 30 '16 at 15:12