I have the following triply-nested for loop and this is the performance bottleneck of my C++ code at the moment.
I want to ask people who are more experienced with xtensor
if what I have arrived at is the best I can get? I mean, is there a way I am missing, to make it work > 20 - 30 - 40 % faster?
Or generally, as C++ advice, if there is anything else I can do to make this piece of code work faster, I would appreciate it a lot if you told me.
I post a MWE code, then explain what I have tried to speed it up. It is visible from the commented parts of the code. Those produce almost equal performance
I measure as the performance the user-time it takes to go through all the triply-nested for-loop iterations.
I can only use 1 core for this (this MWE is part of a more complicated project which uses MPI).
#include <iostream>
#include <vector>
#include <algorithm>
#include <complex>
#include <fstream>
#include <string>
#include <list>
#include <chrono>
#include <thread>
#include <array>
#include <stdexcept>
#include <cmath>
#include <xtensor/xarray.hpp>
#include <xtensor/xfixed.hpp>
#include <xtensor/xio.hpp>
#include <xtensor/xcsv.hpp>
#include <xtensor-io/xhighfive.hpp>
#include <xtensor/xview.hpp>
#include <xtensor/xnpy.hpp>
#include <xtensor/xbuilder.hpp>
#include <xtensor/xrandom.hpp>
#include <xtensor/xmath.hpp>
#include <xtensor/xsort.hpp>
#include <xtensor-blas/xlinalg.hpp>
#include <xtensor-fftw/basic_double.hpp>
#include <xtensor-fftw/helper.hpp>
#include <fftw3.h>
#include <typeinfo>
#include <limits> // for printing with cout a double-type number with full precision
#include <sstream>
typedef std::numeric_limits<double> dbl;
#define _USE_MATH_DEFINES
using namespace xt::placeholders;
double Z = 1.0;
double Coulomb(const double & r) {
return (-Z / r);
}
template <typename BidirectionalIterator, typename T>
BidirectionalIterator getClosest(BidirectionalIterator first,
BidirectionalIterator last,
const T & value)
{
BidirectionalIterator before = std::lower_bound(first, last, value);
if (before == first) return first;
if (before == last) return --last;
BidirectionalIterator after = before;
--before;
return (*after - value) < (value - *before) ? after : before;
}
template <typename BidirectionalIterator, typename T>
std::size_t getClosestIndex(BidirectionalIterator first,
BidirectionalIterator last,
const T & value)
{
return std::distance(first, getClosest(first, last, value));
}
int main() {
const std::complex<double> I(0.0, 1.0); // imaginary unit, I*I = -1
const int Nb = 300;
const int l_max = 50;
int general_idx;
xt::xtensor_fixed<double, xt::xshape<3>> r_vec_cartesian = xt::zeros<double> ( {3} );
xt::xtensor_fixed<double, xt::xshape<3>> r_minus_b_vec_cartesian = xt::zeros<double> ( {3} );
double r_minus_b_vec_cartesian_modulus;
double arg3_imag_part;
int index_from_conhyp_results;
std::complex<double> result_cpp;
xt::xtensor_fixed<std::complex<double>, xt::xshape<1>> factor = xt::zeros<std::complex<double>> ( {1} );
double prefactor = 2.0;
xt::xtensor_fixed<double, xt::xshape<3>> b_dot_vec = xt::zeros<double> ( {3} );
xt::xtensor_fixed<double, xt::xshape<3>> b_dot_vec_times_r_vec = xt::zeros<double> ( {1} );
xt::xtensor<std::complex<double>, 3> tensor_of_exp_of_minusI_bdot_times_r = xt::zeros<std::complex<double>> ( {Nb, (l_max+1), (2*l_max+2)} );
xt::xtensor<double, 3> difference_in_potential_values = xt::zeros<double> ( {Nb, (l_max+1), (2*l_max+2)} );
xt::xtensor<std::complex<double>, 3> conhyp_results = xt::zeros<std::complex<double>> ( {100, 10000, 2} );
std::vector< xt::xtensor_fixed<double, xt::xshape<3>> > r_vecs;
for (a = 0; a <= (Nb-1); a++) {
for (b = 0; b <= l_max; b++) {
for (c = 0; c <= (2*l_max+1); c++) {
general_idxx = c + b*(2*l_max+2) + a*(2*l_max+2)*(l_max+1);
r_vecs.push_back( xt::xtensor_fixed<double, xt::xshape<3>> {0.0, 0.0, 0.0} );
r_vecs[general_idxx](0) = a * 1.0;
r_vecs[general_idxx](1) = b * 2.0;
r_vecs[general_idxx](2) = c * 3.0;
}
}
}
xt::xtensor_fixed<double, xt::xshape<Nb> > coulomb_of_ris;
for (int i = 0; i < Nb; i++) {
coulomb_of_ris(i) = i * 1.0;
}
int general_idx;
std::chrono::steady_clock::time_point start_for_loop_through_cords, end_for_loop_through_cords;
int k_idx = 10;
xt::xtensor_fixed<double, xt::xshape<10000>> to_search_in = xt::zeros<double> ( {10000} );
to_search_in = xt::imag(xt::view(conhyp_results, k_idx, xt::all(), 0)); // will be an array of doubles
auto it = to_search_in.begin();
start_for_loop_through_cords = std::chrono::steady_clock::now();
for (int a = 0; a <= (Nb-1); a++) {
for (int b = 0; b <= l_max; b++) {
for (int c = 0; c <= (2*l_max+1); c++) {
general_idx = c + b*(2*l_max+2) + a*(2*l_max+2)*(l_max+1);
// r_vec_cartesian(0) = ris(a, 0) * sin_thetas(b) * cos_phis(c);
// r_vec_cartesian(1) = ris(a, 0) * sin_thetas(b) * sin_phis(c);
// r_vec_cartesian(2) = ris(a, 0) * cos_thetas(b);
r_vec_cartesian = r_vecs[general_idx];
r_minus_b_vec_cartesian = r_vec_cartesian - b_vec;
r_minus_b_vec_cartesian_modulus = xt::sqrt( xt::linalg::dot(r_minus_b_vec_cartesian, r_minus_b_vec_cartesian) )(0);
// arg3_imag_part = -(k_wave*r_minus_b_vec_cartesian_modulus + double(xt::linalg::dot(k_vec_cartesian, r_minus_b_vec_cartesian)(0))) ;
arg3_imag_part = -( k_wave*r_minus_b_vec_cartesian_modulus + xt::linalg::dot(k_vec_cartesian, r_minus_b_vec_cartesian)(0) ) ;
index_from_conhyp_results = getClosestIndex(it, it + no_of_CONHYP_res_for_one_k_value, arg3_imag_part);
// result_cpp(0) = conhyp_results(k_idx, index_from_conhyp_results, 1);
result_cpp = conhyp_results(k_idx, index_from_conhyp_results, 1);
factor = xt::exp(I * xt::linalg::dot(k_vec_cartesian, r_minus_b_vec_cartesian));
Psi_scattering_prelim(a, b, c) = prefactor * factor(0) * result_cpp;
// Psi_scattering_prelim(a, b, c) = (prefactor * factor * result_cpp)(0);
b_dot_vec_times_r_vec = xt::linalg::dot(b_dot_vec, r_vec_cartesian);
tensor_of_exp_of_minusI_bdot_times_r(a, b, c) = xt::exp(-I * b_dot_vec_times_r_vec)(0);
// tensor_of_exp_of_minusI_bdot_times_r(a, b, c) = xt::exp(-I * xt::linalg::dot(b_dot_vec, r_vec_cartesian))(0);
// difference_in_potential_values(a, b, c) = Coulomb(ris(a, 0)) - Coulomb(r_minus_b_vec_cartesian_modulus);
difference_in_potential_values(a, b, c) = coulomb_of_ris(a) - Coulomb(r_minus_b_vec_cartesian_modulus);
}
}
}
end_for_loop_through_cords = std::chrono::steady_clock::now();
std::cout << "To go through the coordinates, it took: " << std::chrono::duration_cast<std::chrono::microseconds>(end_for_loop_through_cords - start_for_loop_through_cords).count() << " [us]" << std::endl;
return 0;
}
What I have tried:
- replace the
r_vec_cartesian(0) = ris(a, 0) * sin_thetas(b) * cos_phis(c);
andr_vec_cartesian(1) = ris(a, 0) * sin_thetas(b) * sin_phis(c);
andr_vec_cartesian(2) = ris(a, 0) * cos_thetas(b);
calculation with accessing a pre-computed array of results only:r_vec_cartesian = r_vecs[general_idx];
. - writing:
auto r_minus_b_vec_cartesian = r_vec_cartesian - b_vec;
and not declaringr_minus_b_vec_cartesian
as :xt::xtensor_fixed<double, xt::xshape<3>> r_minus_b_vec_cartesian = xt::zeros<double> ( {3} );
anymore. - updating the double
arg3_imag_part
with-(k_wave*r_minus_b_vec_cartesian_modulus + double(xt::linalg::dot(k_vec_cartesian, r_minus_b_vec_cartesian)(0)))
rather than with-( k_wave*r_minus_b_vec_cartesian_modulus + xt::linalg::dot(k_vec_cartesian, r_minus_b_vec_cartesian)(0) )
- having
factor
as axt::xarray_fixed<...>
and not as a plaindouble
, in order to use lazy-evaluation. - trying to have the function
Coulomb()
take inxt::xarray_fixed<...> &
arguments and returningxt::xarray_fixed<...>
, not adouble
value, in order to use:difference_in_potential_values(a, b, c) = (coulomb_of_ris(a) - Coulomb(something_not_evaluated))(0);
Thank you!