I have the following simplified code fragment, which produces slightly different results on each execution. The results also differ from those of the serial version. Could anyone please tell me what is wrong with it?
// User-defined OpenMP reduction that adds two std::vector<float> element-wise.
//
// Combiner:    omp_out[k] = omp_out[k] + omp_in[k] for every element.
// Initializer: each private copy starts as a zero-filled vector with the same
//              size as the original. Zero is the identity for addition; copying
//              omp_orig instead would fold the original contents into the
//              result once per private copy whenever they are non-zero.
#pragma omp declare reduction(vec_float_plus : std::vector<float> : \
    std::transform(omp_out.begin(), omp_out.end(), omp_in.begin(), \
                   omp_out.begin(), std::plus<float>())) \
    initializer(omp_priv = std::vector<float>(omp_orig.size()))
vector<float> vec0;
vec0.resize(M);
std::fill (vec0.begin(),vec0.end(),0);
// Adds A[i0]*B[i0]*C[i0] to every element of vec0, for each i0 in [0, M).
//
// The outer loop is split across threads; each thread accumulates into its own
// private copy of vec0, and the copies are merged by the vec_float_plus
// reduction when the loop ends.
//
// NOTE: floating-point addition is not associative. The parallel version adds
// the terms in a different grouping than the serial loop, and the order in
// which the per-thread partial results are combined is not fixed, so small
// rounding differences versus the serial result and between runs are expected.
#pragma omp parallel for reduction(vec_float_plus : vec0)
for (int i0 = 0; i0 < M; i0++)
{
    // The product does not depend on j0, so compute it once per outer
    // iteration. `auto` keeps the expression's own type so the additions
    // below round exactly as before.
    const auto term = A[i0] * B[i0] * C[i0]; // A, B, C are vectors of size M
    for (int j0 = 0; j0 < M; j0++)
    {
        vec0[j0] += term;
    }
}