Standard operations (+, -, etc.) are very fast in C++, whereas exp is much more expensive to compute. So, if we want a real performance improvement, the most promising route is to precompute the exp calls.
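To isolate that idea before the Rcpp versions, here is a minimal plain-C++ sketch (the function name and setup are mine, not from the original code): fill a vector with exp(beta * x[k]) once, then reuse it inside the double loop instead of calling exp once per (i, j) pair.

#include <cmath>
#include <vector>

// Hypothetical standalone illustration (not part of the original answer):
// n exp() calls instead of the n * (n - 1) / 2 calls of the naive double loop.
std::vector<double> pairwise_sum_precomputed(const std::vector<double>& x, double beta) {
  int n = x.size();
  std::vector<double> e(n), B(n, 0.0);
  for (int k = 0; k < n; ++k)
    e[k] = std::exp(beta * x[k]);            // one exp() per index
  for (int i = 1; i < n; ++i)
    for (int j = 0; j < i; ++j)
      B[i] += (x[i] - x[j]) * e[j] / e[i];   // exp(-beta * (x[i] - x[j])) == e[j] / e[i]
  return B;
}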
Here, B(i) = \sum_{j < i} (x_i - x_j) e^{-\beta(x_i - x_j)} is equivalent to B(i) = \sum_{j < i} (x_i - x_j) e^{\beta x_j} / e^{\beta x_i}, which is exactly that factorization: you only need to compute one exp per index, and the factor that depends on i can be hoisted out of the inner loop. By refactoring it further, you can precompute even more (the algebra is spelled out just before the cumsum-based versions below). So, here are the two previous solutions, followed by my incremental ones:
#include <Rcpp.h>
using namespace Rcpp;

// Original version: O(n^2), with one exp() call per (i, j) pair.
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_org(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  Rcpp::NumericVector B = Rcpp::no_init(n - 1);
  for (int i = 1; i < n; i++) {
    double temp = 0;
    for (int j = 0; j <= i - 1; j++) {
      temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
    }
    B(i - 1) = temp;
  }
  return B;
}
// Previous cached version: x[i] is kept in a local variable,
// but there is still one exp() call per (i, j) pair.
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  Rcpp::NumericVector B = Rcpp::no_init(n - 1);
  double x_i;
  for (int i = 1; i < n; ++i) {
    double temp = 0;
    x_i = x[i];
    for (int j = 0; j <= i - 1; ++j) {
      temp += (x_i - x[j]) * 1 / exp(beta * (x_i - x[j]));
    }
    B(i - 1) = temp;
  }
  return B;
}
// First incremental version: precompute exp(beta * x) once (n exp() calls in total)
// and use exp(-beta * (x_i - x_j)) == exp(beta * x_j) / exp(beta * x_i).
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_2(Rcpp::NumericVector x,
                                         double beta = 3) {
  int i, j, n = x.size();
  Rcpp::NumericVector B(n);
  Rcpp::NumericVector x_exp = exp(beta * x);
  double temp;
  for (i = 1; i < n; i++) {
    temp = 0;
    for (j = 0; j < i; j++) {
      temp += (x[i] - x[j]) * x_exp[j] / x_exp[i];
    }
    B[i] = temp;
  }
  return B;
}
// Same as cache_2, but the division by exp(beta * x_i) is done once per i
// instead of once per (i, j) pair.
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_3(Rcpp::NumericVector x,
                                         double beta = 3) {
  int i, j, n = x.size();
  Rcpp::NumericVector B(n);
  Rcpp::NumericVector x_exp = exp(beta * x);
  double temp;
  for (i = 1; i < n; i++) {
    temp = 0;
    for (j = 0; j < i; j++) {
      temp += (x[i] - x[j]) * x_exp[j];
    }
    B[i] = temp / x_exp[i];
  }
  return B;
}
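The cumsum-based versions below rely on the inner sum splitting into two cumulative sums. Spelling out the algebra (this is just my rearrangement of the formula above):

B(i) = \sum_{j < i} (x_i - x_j) e^{\beta x_j} / e^{\beta x_i}
     = \left( x_i \sum_{j < i} e^{\beta x_j} - \sum_{j < i} x_j e^{\beta x_j} \right) / e^{\beta x_i},

so keeping running totals of \sum e^{\beta x_j} and \sum x_j e^{\beta x_j} replaces the O(n^2) double loop with a single O(n) pass. Note that the j = i term is zero, so the cumulative sums can safely include it, which is what the next versions do.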
// Vectorized version: both cumulative sums are computed with Rcpp sugar
// (exp, cumsum), so there is no explicit loop at all.
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_4(Rcpp::NumericVector x,
                                         double beta = 3) {
  Rcpp::NumericVector exp_pre = exp(beta * x);
  Rcpp::NumericVector exp_pre_cumsum = cumsum(exp_pre);
  Rcpp::NumericVector x_exp_pre_cumsum = cumsum(x * exp_pre);
  return (x * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
}
// Single-pass version: the two running sums are updated in place,
// so no temporary vectors are allocated.
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_5(Rcpp::NumericVector x,
                                         double beta = 3) {
  int n = x.size();
  NumericVector B(n);
  double exp_pre, exp_pre_cumsum = 0, x_exp_pre_cumsum = 0;
  for (int i = 0; i < n; i++) {
    exp_pre = exp(beta * x[i]);
    exp_pre_cumsum += exp_pre;
    x_exp_pre_cumsum += x[i] * exp_pre;
    B[i] = (x[i] * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
  }
  return B;
}
/*** R
set.seed(111)
x = rnorm(1e3)
all.equal(
  hawk_process_org(x),
  hawk_process_cache(x)
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_2(x)[-1]
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_3(x)[-1]
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_4(x)[-1]
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_5(x)[-1]
)
microbenchmark::microbenchmark(
  hawk_process_org(x),
  hawk_process_cache(x),
  hawk_process_cache_2(x),
  hawk_process_cache_3(x),
  hawk_process_cache_4(x),
  hawk_process_cache_5(x)
)
*/
Benchmark for x = rnorm(1e3):
Unit: microseconds
expr min lq mean median uq max neval cld
hawk_process_org(x) 19801.686 20610.0365 21017.89339 20816.1385 21157.4900 25548.042 100 d
hawk_process_cache(x) 20506.903 21062.1370 21534.47944 21297.8710 21775.2995 26030.106 100 e
hawk_process_cache_2(x) 1895.809 2038.0105 2087.20696 2065.8220 2103.0695 3212.874 100 c
hawk_process_cache_3(x) 430.084 458.3915 494.09627 474.2840 503.0885 1580.282 100 b
hawk_process_cache_4(x) 50.657 55.2930 71.60536 57.6105 63.5700 1190.260 100 a
hawk_process_cache_5(x) 43.373 47.0155 60.43775 49.6640 55.6235 842.288 100 a
This is much more effective than trying to gain nanoseconds from small optimizations that are likely to make your code harder to read.
But still, let's try the optimizations proposed by @coatless (Rcpp::no_init and caching x[i] in a local variable) on my very last solution:
// Same as cache_5, plus the @coatless suggestions: no_init to skip
// zero-initializing B, and x[i] cached in a local variable.
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_6(Rcpp::NumericVector x,
                                         double beta = 3) {
  int n = x.size();
  NumericVector B = Rcpp::no_init(n);
  double x_i, exp_pre, exp_pre_cumsum = 0, x_exp_pre_cumsum = 0;
  for (int i = 0; i < n; ++i) {
    x_i = x[i];
    exp_pre = exp(beta * x_i);
    exp_pre_cumsum += exp_pre;
    x_exp_pre_cumsum += x_i * exp_pre;
    B[i] = (x_i * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
  }
  return B;
}
Benchmark for x = rnorm(1e6):
Unit: milliseconds
expr min lq mean median uq max neval cld
hawk_process_cache_5(x) 42.52886 43.53653 45.28427 44.46688 46.74129 57.38046 100 a
hawk_process_cache_6(x) 42.14778 43.19054 45.93252 44.28445 46.51052 153.30447 100 a
Still not very convincing: on this benchmark, the extra micro-optimizations barely move the needle compared with the algorithmic change.