As Gabe mentioned in his comment,
you can exploit the fact that your decision space can be interpreted as single integers:
set.seed(1234L)
N <- 10L
performance <- runif(2^N)
powers_of_two <- as.integer(rev(2L ^ (0L:(N - 1L))))
is_local_max <- sapply(0L:(2^N - 1), function(i) {
multipliers <- as.integer(rev(intToBits(i)[1L:N])) * -1L
multipliers[multipliers == 0L] <- 1L
neighbors <- i + powers_of_two * multipliers
# compensate that R vectors are 1-indexed
!any(performance[neighbors + 1L] > performance[i + 1L])
})
# compensate again
local_peaks_int <- which(is_local_max) - 1L
local_peaks_binary <- t(sapply(local_peaks_int, function(int) {
as.integer(rev(intToBits(int)[1L:N]))
}))
> head(local_peaks_binary)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 0 0 0 0 0 0 0 1 0 0
[2,] 0 0 0 0 1 0 0 1 1 0
[3,] 0 0 0 0 1 1 1 1 0 0
[4,] 0 0 0 1 0 0 0 1 1 1
[5,] 0 0 0 1 0 1 0 1 0 1
[6,] 0 0 0 1 1 0 1 1 1 0
In decimal,
multipliers
contains the the sign of the powers_of_two
so that,
when added to the current integer,
it represents a bit flip in binary.
For example,
if the original binary was 0 0
and we flip one bit to get 1 0
,
it's as if we added 2^1 in decimal,
but if it was originally 1 0
and we flip one bit to get 0 0
,
then we subtracted 2^1 in decimal.
Each row in local_peaks_binary
is a binary from your decision space,
where the least significant bit is on the right.
So, for example, the first local peak is a decimal 4.
See this question for the mapping of integers to binary.
EDIT: and if you want to do it in parallel:
library(doParallel)
set.seed(1234L)
N <- 20L
performance <- runif(2^N)
powers_of_two <- as.integer(rev(2 ^ (0:(N - 1))))
num_cores <- detectCores()
workers <- makeCluster(num_cores)
registerDoParallel(workers)
chunks <- splitIndices(length(performance), num_cores)
chunks <- lapply(chunks, "-", 1L)
local_peaks_int <- foreach(chunk=chunks, .combine=c, .multicombine=TRUE) %dopar% {
is_local_max <- sapply(chunk, function(i) {
multipliers <- as.integer(rev(intToBits(i)[1L:N])) * -1L
multipliers[multipliers == 0L] <- 1L
neighbors <- i + powers_of_two * multipliers
# compensate that R vectors are 1-indexed
!any(performance[neighbors + 1L] > performance[i + 1L])
})
# return
chunk[is_local_max]
}
local_peaks_binary <- t(sapply(local_peaks_int, function(int) {
as.integer(rev(intToBits(int)[1L:N]))
}))
stopCluster(workers); registerDoSEQ()
The above completes in ~2.5 seconds in my system,
which has 4 CPU cores.
Here is a C++ version that uses multi-threading but,
at least in my system with 4 threads,
it doesn't seem faster than Gabe's Fortran version.
However, when I try to run Gabe's Fortran code in a new session,
I get the following error with N <- 29L
:
cannot allocate vector of size 4.0 Gb
.
EDIT: Apparently I changed something important along the way,
because after testing again,
the C++ version actually seems faster.
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(RcppParallel)]]
#include <cstddef> // size_t
#include <vector>
#include <Rcpp.h>
#include <RcppParallel.h>
using namespace std;
using namespace Rcpp;
using namespace RcppParallel;
class PeakFinder : public Worker
{
public:
PeakFinder(const NumericVector& performance, vector<int>& peaks, const int N)
: performance_(performance)
, peaks_(peaks)
, N_(N)
{ }
void operator()(size_t begin, size_t end) {
vector<int> peaks;
for (size_t i = begin; i < end; i++) {
bool is_local_peak = true;
unsigned int mask = 1;
for (int exponent = 0; exponent < N_; exponent++) {
unsigned int neighbor = static_cast<unsigned int>(i) ^ mask; // bitwise XOR
if (performance_[i] < performance_[neighbor]) {
is_local_peak = false;
break;
}
mask <<= 1;
}
if (is_local_peak)
peaks.push_back(static_cast<int>(i));
}
mutex_.lock();
peaks_.insert(peaks_.end(), peaks.begin(), peaks.end());
mutex_.unlock();
}
private:
const RVector<double> performance_;
vector<int>& peaks_;
const int N_;
tthread::mutex mutex_;
};
// [[Rcpp::export]]
IntegerVector local_peaks(const NumericVector& performance, const int N) {
vector<int> peaks;
PeakFinder peak_finder(performance, peaks, N);
// each thread call will check at least 1024 values
parallelFor(0, performance.length(), peak_finder, 1024);
IntegerVector result(peaks.size());
int i = 0;
for (int peak : peaks) {
result[i++] = peak;
}
return result;
}
After saving the C++ code in local-peaks.cpp
,
this code:
library(Rcpp)
library(RcppParallel)
sourceCpp("local-peaks.cpp")
set.seed(1234L)
N <- 29L
performance <- runif(2^N)
system.time({
local_peaks_int <- local_peaks(performance, N)
})
finished in ~2 seconds
(without considering the time required to allocate performance
).
If you do need the binary representation,
you can change local_peaks
like this
(see this question):
// [[Rcpp::export]]
IntegerMatrix local_peaks(const NumericVector& performance, const int N) {
vector<int> peaks;
PeakFinder peak_finder(performance, peaks, N);
// each thread call will check at least 1024 values
parallelFor(0, performance.length(), peak_finder, 1024);
// in case you want the same order every time, #include <algorithm> and uncomment next line
// sort(peaks.begin(), peaks.end());
IntegerMatrix result(peaks.size(), N);
int i = 0;
for (int peak : peaks) {
for (int j = 0; j < N; j++) {
result(i, N - j - 1) = peak & 1;
peak >>= 1;
}
i++;
}
return result;
}