Here is the simple code which reproduces the error that I get:
#include <math.h>
#include <iostream>
//#include <omp.h>
//handling Not a number exception:
#include <fenv.h>
#include <signal.h>
#include "unistd.h"
void handler(int sig)
{
printf("Floating Point Exception\n");
exit(0);
}
#define EKCOR
const float alpha=200.0/137;
const int N=4096;//4096;//8192;//16384;
const float md=940;
const float Ep=0.1f;
float E1;
int STEP=1;
struct float3
{
float x, y, z;
};
float3 Pi;
struct Particle
{
float x;
float y;
float z;
float t;
float vx;
float vy;
float vz;
float m;
};
Particle p[N] __attribute__((aligned(64)));
inline float3 RandomDirection()
{
float number1 = static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
float z = 2.0*number1 - 1.;
float rho = sqrtf((1.+z)*(1.-z));
float number2 = static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
float phi = M_PI*2.0*number2;
float3 result={rho*cosf(phi), rho*sinf(phi), z};
return result;
}
void function()
{
float K=0.0;
Pi={0.0, 0.0, 0.0};
double Px=0.0;
double Py=0.0;
double Pz=0.0;
float P3=0.0;
float P4=0.0;
//#1
for(int i=0; i<N; ++i)
{
Px+=p[i].vx*p[i].m;
Py+=p[i].vy*p[i].m;
Pz+=p[i].vz*p[i].m;
float Penergy=0.0;
#pragma novector
for(int j=0; j<N; ++j) if(i!=j)
{
float rdist1=sqrt((p[i].x-p[j].x)*(p[i].x-p[j].x)+(p[i].y-p[j].y)*(p[i].y-p[j].y)+(p[i].z-p[j].z)*(p[i].z-p[j].z));
Penergy+=alpha/rdist1;
P4+=alpha/rdist1;
}
P3+=Penergy;
float v2=p[i].vx*p[i].vx+p[i].vy*p[i].vy+p[i].vz*p[i].vz;
K+=p[i].m*v2/2;
}
P4/=2;
Pi.x=Px;
Pi.y=Py;
Pi.z=Pz;
P3/=2;
float E2=K+P3;
float r=(E1-P3)/K;
std::cout<<"r="<<r<<",E1="<<E1<<",P3="<<P3<<",K="<<K<<std::endl;
float rc=sqrt(r);
std::cout<<"E2="<<K+P3<<",K="<<K<<",P3="<<P3<<",P4="<<P4<<",Px="<<Pi.x<<",Py="<<Pi.y<<",Pz="<<Pi.z<<std::endl;
}
void init()
{
const double pi=3.1415926536;
float RADIUS=pow(50.0*N,1.0/3.0);
Pi={0.0, 0.0, 0.0};
double Px=0.0, Py=0.0, Pz=0.0;
#pragma omp single
for(int i=0; i<N; ++i)
{
float DISTANCE=0.0f;
if(i>0)
{
while(DISTANCE<=1.0f)
{
float theta=pi*static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
float phi=2*pi*static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
float rr=RADIUS*static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
p[i].x =rr*sin(theta)*cos(phi);
p[i].y =rr*sin(theta)*sin(phi);
p[i].z =rr*cos(theta);
DISTANCE=10000.0f;
#pragma simd reduction(min:DISTANCE)
for(int j=0; j<i; ++j)
{
float dij=sqrt((p[i].x-p[j].x)*(p[i].x-p[j].x)+(p[i].y-p[j].y)*(p[i].y-p[j].y)+(p[i].z-p[j].z)*(p[i].z-p[j].z));
if(dij<DISTANCE) DISTANCE=dij;
}
}
}
else
{
float theta=pi*static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
float phi=2*pi*static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
float rr=RADIUS*static_cast <float> (rand())/(static_cast <float> (RAND_MAX));
p[i].x =rr*sin(theta)*cos(phi);
p[i].y =rr*sin(theta)*sin(phi);
p[i].z =rr*cos(theta);
}
float modv=sqrt(2.0*Ep/md);
float3 res=RandomDirection();
float3 v;
v.x=modv*res.x;
v.y=modv*res.y;
v.z=modv*res.z;
p[i].vx =v.x;
p[i].vy =v.y;
p[i].vz =v.z;
p[i].m=md;
Px+=p[i].vx*p[i].m;
Py+=p[i].vy*p[i].m;
Pz+=p[i].vz*p[i].m;
}
Px/=N;
Py/=N;
Pz/=N;
#pragma novector
for(int i=0; i<N; ++i)
{
p[i].vx-=Px/p[i].m;
p[i].vy-=Py/p[i].m;
p[i].vz-=Pz/p[i].m;
}
Px=0.0, Py=0.0, Pz=0.0;
float K1=0.0;
float P1=0.0;
float P2=0.0;
//#2
#pragma novector
for(int i=0; i<N; ++i)
{
Px+=p[i].vx*p[i].m;
Py+=p[i].vy*p[i].m;
Pz+=p[i].vz*p[i].m;
K1+=p[i].vx*p[i].vx+p[i].vy*p[i].vy+p[i].vz*p[i].vz;
float pp=0.0;
#pragma novector
for(int j=0; j<N; ++j) if(i!=j)
{
float rd=sqrt((p[i].x-p[j].x)*(p[i].x-p[j].x)+(p[i].y-p[j].y)*(p[i].y-p[j].y)+(p[i].z-p[j].z)*(p[i].z-p[j].z));
P1+=alpha/rd;
pp+=alpha/rd;
}
P2+=pp;
}
Pi.x=Px;
Pi.y=Py;
Pi.z=Pz;
K1*=md/2;
P1/=2;
P2/=2;
E1=K1+P1;
std::cout<<"INIT Px="<<Pi.x<<" Py="<<Pi.y<<" Pz="<<Pi.z<<" K1="<<K1<<" P1="<<P1<<" P2="<<P2<<" E1="<<E1<<std::endl;
}
int
main(int argc, char **argv)
{
//handling Not a number exception:
feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW);
signal(SIGFPE, handler);
//
init();
function();
}
At N<1024 P1=P2 and P3=P4. Only at N=256 there is a small difference:
N=256 P1=3492.48 P2=3492.5 P3=3492.5 P4=3492.48
But at N=1024 and N>1024 the difference becomes more and more:
N=1024 P1=34968.6 P2=34969.7 P3=34969.7 P4=34968.6
N=2048 P1=114493 P2=114482 P3=114482 P4=114493
N=4096 P1=357880 P2=362032 r=-9.14142
Here the program crashes, because r=-9.14142 and sqrt(r) throws floating point exception.
My OS is Fedora 23, processor is Intel Core i7-3770, and i used compilers gcc version 5.3.1 and intel c++ compiler icpc version 17.0.1, if it is necessary. They both gave an error even if not to use OpenMP.
The description of the trouble is given below the code. My question is:
Why P1 differs from P2 and P3 differs from P4 so great at N>=1024 (one may compile with Intel (icpc) or gcc (g++) compiler without arguments)? The program is running in 1 thread. They must have the same value!
I need to write the code so that the nested for loops #1 and #2 were parallelized using
#pragma omp parallel for reduction(+:P) for(int i=0; i(p[i].x-p[j].x)+(p[i].y-p[j].y)(p[i].y-p[j].y)+(p[i].z-p[j].z)*(p[i].z-p[j].z)); PP+=alpha/r; } P+=PP; } P/=2;
and worked with all optimization flags (i use the set -DCMAKE_CXX_FLAGS="-march=native -mtune=native -ipo16 -fp-model fast=2 -O3 -qopt-report=5 -mcmodel=large" for the Intel compiler). I can't do it (even with only "-O0"). May be because of the 1) error, it gives me a wrong value.