I want to assign the value of a matrix to the submat of another matrix as A.submat(ni1, ni2, nk1, nk2) = B;
It seems very slow. I'm wondering why it's so slow, and is there some way to improve it?
Here is my testing codes(Because the function "XForwarDifference" need to be called millions time in my project, I need to profile it better)
#include <armadillo>
#include <chrono>
#include <iostream>
using ms = std::chrono::milliseconds;
using ns = std::chrono::nanoseconds;
using get_time = std::chrono::steady_clock;
namespace {
const arma::ivec::fixed<5> iforward = {-1, 0, 1, 2, 3};
const double MCA_1 = -0.30874;
const double MCA0 = -0.6326;
const double MCA1 = 1.2330;
const double MCA2 = -0.3334;
const double MCA3 = 0.04168;
}
arma::mat XForwardDifference(arma::mat& mat,
const int& ni1,
const int& ni2,
const int& nk1,
const int& nk2,
const double& dx)
{
arma::mat ret(size(mat));
double sign_dx = 1./dx;
double mca_1= sign_dx * MCA_1;
double mca0 = sign_dx * MCA0;
double mca1 = sign_dx * MCA1;
double mca2 = sign_dx * MCA2;
double mca3 = sign_dx * MCA3;
auto t1 = get_time::now();
auto m1 = mat.submat(ni1+iforward(0), nk1, ni2+iforward(0), nk2);
auto t2 = get_time::now();
auto m2 = mca_1 * m1;
auto t3 = get_time::now();
auto m3 = m2 + m2;
auto t4 = get_time::now();
mat.submat(ni1, nk1, ni2, nk2) = m3;
auto t5 = get_time::now();
std::cout << std::chrono::duration_cast<ns>(t2-t1).count() << std::endl;
std::cout << std::chrono::duration_cast<ns>(t3-t2).count() << std::endl;
std::cout << std::chrono::duration_cast<ns>(t4-t3).count() << std::endl;
std::cout << std::chrono::duration_cast<ns>(t5-t4).count() << std::endl;
// ret.submat(ni1, nk1, ni2, nk2) =
// mca_1* mat.submat(ni1+iforward(0), nk1, ni2+iforward(0), nk2) +
// mca0 * mat.submat(ni1+iforward(1), nk1, ni2+iforward(1), nk2) +
// mca1 * mat.submat(ni1+iforward(2), nk1, ni2+iforward(2), nk2) +
// mca2 * mat.submat(ni1+iforward(3), nk1, ni2+iforward(3), nk2) +
// mca3 * mat.submat(ni1+iforward(4), nk1, ni2+iforward(4), nk2);
return ret;
}
int main(int argc, char *argv[])
{
const int len = 3;
int ni1, ni2, nk1, nk2, ni, nk;
ni = 200;
nk = 200;
ni1 = len;
ni2 = ni1 + ni - 1;
nk1 = len;
nk2 = nk1 + nk - 1;
const double dx = 1.;
auto start_time = get_time::now();
arma::mat mat(ni + 2*len, nk + 2*len);
mat = XForwardDifference(mat, ni1, ni2, nk1, nk2, dx);
auto end_time = get_time::now();
auto diff = end_time - start_time;
std::cout << "Elapsed time is : "
<< std::chrono::duration_cast<ns>(diff).count()
<< " ns "
<< std::endl;
return 0;
}
The output is:
180
116
110
851123
Elapsed time is : 961975 ns
You can see mat.submat(ni1, nk1, ni2, nk2) = m3;
covers the most of elapsed time.
hbrerkere gives the reason:
Armadillo queues up all the operations until the result is assigned to a matrix or a submatrix. This is why the assignment to submat seems to take longer. It's actually doing the multiplication and addition at assignment time, not before. Also, don't use the auto keyword with Armadillo matrices and expressions, as that can lead to problems. – hbrerkere
auto t1 = get_time::now();
arma::mat m1 = mat.submat(ni1+iforward(0), nk1, ni2+iforward(0), nk2);
auto t2 = get_time::now();
arma::mat m2 = mca_1 * m1;
auto t3 = get_time::now();
arma::mat m3 = m2 + m2;
auto t4 = get_time::now();
ret = m3;
auto t5 = get_time::now();
If I modify the codes as he said, then the output now is below:
391880
356480
373072
113051
Elapsed time is : 1352013 ns
I also encounter the situation that auto
will bring problems in armadillo.
auto m1 = mat.submat(ni1, nk1, ni2, nk2) * 2;
cout << size(m1) << endl;
It will print very large size that is not correct.