Here's the baseline that I compare against. All fancy versions like std::ranges
should compile to similar code, so let's keep it simple.
#include <Eigen/Dense> // Eigen::VectorXd, Eigen::Ref
#include <cassert>     // assert
#include <cmath>       // std::abs

Eigen::VectorXd fixed_values(
    const Eigen::Ref<const Eigen::VectorXd>& xl,
    const Eigen::Ref<const Eigen::VectorXd>& xu,
    double tol)
{
    Eigen::Index n = xl.size();
    assert(xu.size() == n);
    Eigen::VectorXd rtrn(n);
    Eigen::Index out = 0;
    for(Eigen::Index i = 0; i < n; ++i) {
        double xli = xl[i], xui = xu[i];
        if(std::abs(xli - xui) <= tol)
            rtrn[out++] = (xli + xui) * 0.5;
    }
    rtrn.conservativeResize(out);
    return rtrn;
}
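For reference, a C++20 ranges flavour of that baseline might look roughly like the sketch below (the name fixed_values_ranges and the index-based formulation are mine; I'd expect it to compile down to essentially the same loop):
#include <ranges>

Eigen::VectorXd fixed_values_ranges(
    const Eigen::Ref<const Eigen::VectorXd>& xl,
    const Eigen::Ref<const Eigen::VectorXd>& xu,
    double tol)
{
    Eigen::Index n = xl.size();
    assert(xu.size() == n);
    Eigen::VectorXd rtrn(n);
    // iterate over indices because we need xl and xu side by side
    auto means = std::views::iota(Eigen::Index{0}, n)
        | std::views::filter([&](Eigen::Index i)
              { return std::abs(xl[i] - xu[i]) <= tol; })
        | std::views::transform([&](Eigen::Index i)
              { return (xl[i] + xu[i]) * 0.5; });
    Eigen::Index out = 0;
    for(double v : means)
        rtrn[out++] = v;
    rtrn.conservativeResize(out);
    return rtrn;
}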
Compile with -DNDEBUG to avoid the cost of range checks in the loop. 100,000 calls with 10,000 elements per vector and ca. 5,000 output elements take 2.4 seconds on my system.
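The benchmark driver isn't shown here, but a minimal one matching that setup could look like this (my own sketch; the tolerance, the input data, and the timing method are assumptions, and it expects one of the fixed_values definitions from this post to be in scope):
#include <Eigen/Dense>
#include <chrono>
#include <iostream>

int main()
{
    const Eigen::Index n = 10000;
    const int calls = 100000;
    Eigen::VectorXd xl = Eigen::VectorXd::Random(n);
    Eigen::VectorXd xu = xl;
    for(Eigen::Index i = 0; i < n; i += 2)
        xu[i] += 1.0; // widen every second interval -> ca. 5,000 fixed values
    double checksum = 0.0; // keeps the optimizer from discarding the calls
    const auto start = std::chrono::steady_clock::now();
    for(int rep = 0; rep < calls; ++rep)
        checksum += fixed_values(xl, xu, 1e-9).sum();
    const auto stop = std::chrono::steady_clock::now();
    const std::chrono::duration<double> seconds = stop - start;
    std::cout << seconds.count() << " s (checksum " << checksum << ")\n";
}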
Here is an optimized version.
Eigen::VectorXd fixed_values(
    const Eigen::Ref<const Eigen::VectorXd>& xl,
    const Eigen::Ref<const Eigen::VectorXd>& xu,
    double tol)
{
    Eigen::Index n = xl.size();
    assert(xu.size() == n);
    Eigen::VectorXd rtrn(n);
    Eigen::Index out = 0;
    for(Eigen::Index i = 0; i < n; ++i) {
        double xli = xl[i], xui = xu[i];
        rtrn[out] = (xli + xui) * 0.5;     // unconditional store
        out += std::abs(xli - xui) <= tol; // bool advances the cursor by 0 or 1
    }
    rtrn.conservativeResize(out);
    return rtrn;
}
This takes 0.5 seconds in the same benchmark. The trick is to avoid the conditional jump. The redundant memory stores are dirt-cheap in comparison.
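The trick isn't tied to Eigen at all; stripped down to its bare form it looks like this hypothetical helper (names made up for illustration):
#include <cstddef>

template<class Value, class Keep>
std::ptrdiff_t compact_indexed(std::ptrdiff_t n, double* out, Value value, Keep keep)
{
    std::ptrdiff_t count = 0;
    for(std::ptrdiff_t i = 0; i < n; ++i) {
        out[count] = value(i); // unconditional store, maybe overwritten next iteration
        count += keep(i);      // bool converts to 0 or 1, no branch
    }
    return count;
}
The version above is exactly this with value(i) = (xl[i] + xu[i]) * 0.5 and keep(i) = std::abs(xl[i] - xu[i]) <= tol.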
This pattern should work as long as the computation per element is relatively cheap. If it becomes expensive, it may make sense to use the same trick to generate the indices instead.
Eigen::VectorXd fixed_values(
    const Eigen::Ref<const Eigen::VectorXd>& xl,
    const Eigen::Ref<const Eigen::VectorXd>& xu,
    double tol)
{
    Eigen::Index n = xl.size();
    assert(xu.size() == n);
    Eigen::VectorXi indices(n);
    // lazy Eigen expression; it is evaluated element by element in the loop below
    const auto condition = (xl.array() - xu.array()).abs() <= tol;
    Eigen::Index out = 0;
    for(Eigen::Index i = 0; i < n; ++i) {
        indices[out] = i;
        out += condition[i];
    }
    auto used = indices.head(out);
    return (xl(used, 0) + xu(used, 0)) * 0.5;
}
But don't do this for simple operations like the ones above. Case in point: this version takes 0.75 seconds.
Side-note: If you don't want to compile with -DNDEBUG to keep the assertions active, change the code to work with the raw pointers from vector.data() instead.
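A minimal sketch of that raw-pointer variant of the 0.5-second version (otherwise unchanged; Eigen's operator[] is what triggers the assertions in debug builds, plain pointer indexing sidesteps it):
Eigen::VectorXd fixed_values_ptr(
    const Eigen::Ref<const Eigen::VectorXd>& xl,
    const Eigen::Ref<const Eigen::VectorXd>& xu,
    double tol)
{
    Eigen::Index n = xl.size();
    assert(xu.size() == n);
    Eigen::VectorXd rtrn(n);
    const double* xlptr = xl.data(), *xuptr = xu.data();
    double* outptr = rtrn.data();
    Eigen::Index out = 0;
    for(Eigen::Index i = 0; i < n; ++i) {
        double xli = xlptr[i], xui = xuptr[i];
        outptr[out] = (xli + xui) * 0.5;   // unconditional store
        out += std::abs(xli - xui) <= tol; // branch-free advance
    }
    rtrn.conservativeResize(out);
    return rtrn;
}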
Intrinsics
AVX-512 comes with a new compress-store instruction (vcompresspd) that can be used for this. However, I don't think compilers are good at using it yet. But with some manual work, it can be exploited. I'm using Intel intrinsics for this:
#include <immintrin.h> // using Intel intrinsics
#include <cstdint>     // using std::uintptr_t

Eigen::VectorXd fixed_values(
    const Eigen::Ref<const Eigen::VectorXd>& xl,
    const Eigen::Ref<const Eigen::VectorXd>& xu,
    double tol)
{
    Eigen::Index n = xl.size();
    assert(xu.size() == n);
    Eigen::VectorXd rtrn(n);
    const double* xlptr = xl.data(), *xuptr = xu.data();
    double* outptr = rtrn.data();
    /*
     * Step 1: Scalar loop until the input is aligned with a cache-line
     * boundary (= natural alignment of the vectors).
     * We do this on the input since the output will be misaligned
     * anyway
     */
    unsigned startbyte = static_cast<unsigned>(
        reinterpret_cast<std::uintptr_t>(xlptr));
    unsigned next_aligned = (startbyte + 63) & -64; // round up to a multiple of 64
    Eigen::Index misaligned = std::min<Eigen::Index>(
        n, (next_aligned - startbyte) / sizeof(double));
    Eigen::Index out = 0;
    Eigen::Index i;
    for(i = 0; i < misaligned; ++i) {
        double xli = xlptr[i], xui = xuptr[i];
        outptr[out] = (xli + xui) * 0.5;
        out += std::abs(xli - xui) <= tol;
    }
    /*
     * Step 2: Vectorized main loop.
     * Might be worth comparing with a loop that uses 256 bit
     * vectors for hardware that doesn't like 512 bit vectors
     */
    const __m512d tol8 = _mm512_set1_pd(tol);
    const __m512d half8 = _mm512_set1_pd(0.5);
    for(; i + 8 <= n; i += 8) {
        __m512d xli = _mm512_load_pd(xlptr + i);  // aligned thanks to step 1
        __m512d xui = _mm512_loadu_pd(xuptr + i); // xu may have a different alignment
        __m512d diff = _mm512_sub_pd(xli, xui);
        __m512d absdiff = _mm512_abs_pd(diff);
        __mmask8 mask = _mm512_cmp_pd_mask(absdiff, tol8, _CMP_LE_OQ);
        __m512d sum = _mm512_add_pd(xli, xui);
        __m512d mean = _mm512_mul_pd(sum, half8);
        _mm512_mask_compressstoreu_pd(outptr + out, mask, mean);
        unsigned imask = _cvtmask8_u32(mask);
        out += _mm_popcnt_u32(imask);
    }
    /*
     * Step 3: Scalar loop for the last 0-7 elements
     */
    for(; i < n; ++i) {
        double xli = xlptr[i], xui = xuptr[i];
        outptr[out] = (xli + xui) * 0.5;
        out += std::abs(xli - xui) <= tol;
    }
    rtrn.conservativeResize(out);
    return rtrn;
}
This runs in 0.166 seconds on my system.
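For completeness, the 256-bit comparison mentioned in the step-2 comment could use a main loop like the sketch below. It assumes AVX-512VL on top of AVX-512F, steps 1 and 3 stay the same, and as usual the whole thing needs the matching target flags (e.g. -march=native on GCC/Clang). Whether it beats the 512-bit loop depends on how the hardware handles 512-bit vectors, so measure.
/*
 * Hypothetical 256-bit main loop; prologue and epilogue unchanged
 */
const __m256d tol4 = _mm256_set1_pd(tol);
const __m256d half4 = _mm256_set1_pd(0.5);
const __m256d signbit = _mm256_set1_pd(-0.0);
for(; i + 4 <= n; i += 4) {
    __m256d xli = _mm256_load_pd(xlptr + i);  // still 32-byte aligned thanks to step 1
    __m256d xui = _mm256_loadu_pd(xuptr + i);
    __m256d diff = _mm256_sub_pd(xli, xui);
    // clear the sign bit to get |xli - xui|
    __m256d absdiff = _mm256_andnot_pd(signbit, diff);
    __mmask8 mask = _mm256_cmp_pd_mask(absdiff, tol4, _CMP_LE_OQ);
    __m256d mean = _mm256_mul_pd(_mm256_add_pd(xli, xui), half4);
    _mm256_mask_compressstoreu_pd(outptr + out, mask, mean);
    out += _mm_popcnt_u32(_cvtmask8_u32(mask));
}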