I found this very recent paper https://arxiv.org/pdf/1901.01926.pdf, where all is nicely typeset an explained.
Apparently there is always a compromise between time and space complexity and there is no algorithm (yet?) that is O(n)-in-time and in-place.
(The paper claims to be the first subquadratic deterministic in-place algorithm.)
The algorithm you posted is not listed in the paper and would be O(n) in time and O(n) in space (i.e. out-of-place).
I post it here for reference, also to check the correctness of other implementations.
I used zero-base indexing for simplicity.
// O(n) out-of-place algorithm
template<class It, class Size, class ItOut>
void inverse_permutation_n(It first, Size n, ItOut d_first){
for(Size i = 0; i != n; ++i){
d_first[first[i]] = i;
}
}
Then there is the algorithm that is listed as "folklore" in the table translated from the pseudocode in the paper (Listing 3 in the paper).
#include<algorithm> // for std::min
template<class It, class Index>
void reverse_cycle(It t, Index start){
auto cur = t[start];
auto prev = start;
while( cur != start ){
auto next = t[cur];
t[cur] = prev;
prev = cur;
cur = next;
}
t[start] = prev;
}
template<class It, class Index>
auto cycle_leader(It t, Index start){
auto cur = t[start];
auto smallest = start;
while(cur != start){
smallest = std::min(smallest, cur);
cur = t[cur];
}
return smallest;
}
// O(n²) in-place algorithm
template<class It, class Size>
void inverse_permutation_n(It first, Size n){
for(Size i = 0; i != n; ++i){
if( cycle_leader(first, i) == i ) reverse_cycle(first, i);
}
}
The above algorithm is O(n²)-time in the average case.
The reason is that for each point you have to follow a cycle of length O(n) to find the leader.
Then you can build on top of this by putting a shortcut in the cycle leader search, once the cycle leader is less than the starting point the search returns false.
template<class It, class Index>
auto cycle_leader_shortcut(It t, Index start){
auto cur = t[start];
while(cur != start){
if(cur < start) return false;
cur = t[cur];
}
return true;
}
// O(n log n) in-place algorithm
template<class It, class Size>
void inverse_permutation_shortcut_n(It first, Size n){
for(Size i = 0; i != n; ++i){
if( cycle_leader_shortcut(first, i) ) reverse_cycle(first, i);
}
}
This algorithm is fortunately O(N log N) (in average, I think).
The reason is that the cycles iteration become shorter as the point in the sequence increases because it is more likely that a point will have a low value that was already reversed.
This is the benchmark and the result:
#include<numeric> // for iota
#include<random>
// initialize rng
std::random_device rd;
std::mt19937 g(rd());
static void BM_inverse_permutation(benchmark::State& state){
// allocate memory and initialize test buffer and reference solution
auto permutation = std::vector<std::size_t>(state.range(0));
std::iota(permutation.begin(), permutation.end(), 0);
auto reference_inverse_permutation = std::vector<std::size_t>(permutation.size());
for(auto _ : state){
state.PauseTiming(); // to random shuffle and calculate reference solution
std::shuffle(permutation.begin(), permutation.end(), g);
// inverse_permutation_n(permutation.cbegin(), permutation.size(), reference_inverse_permutation.begin());
benchmark::DoNotOptimize(permutation.data());
benchmark::ClobberMemory();
state.ResumeTiming();
inverse_permutation_n(permutation.begin(), permutation.size());
benchmark::DoNotOptimize(permutation.data());
benchmark::ClobberMemory();
// state.PauseTiming(); // to check that the solution is correct
// if(reference_inverse_permutation != permutation) throw std::runtime_error{""};
// state.ResumeTiming();
}
state.SetItemsProcessed(state.iterations()*permutation.size() );
state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_inverse_permutation)->RangeMultiplier(2)->Range(8, 8<<10)->Complexity(benchmark::oNSquared);
static void BM_inverse_permutation_shortcut(benchmark::State& state){
// allocate memory and initialize test buffer and reference solution
auto permutation = std::vector<std::size_t>(state.range(0));
std::iota(permutation.begin(), permutation.end(), 0);
auto reference_inverse_permutation = std::vector<std::size_t>(permutation.size());
for(auto _ : state){
state.PauseTiming(); // to random shuffle and calculate reference solution
std::shuffle(permutation.begin(), permutation.end(), g);
// inverse_permutation_n(permutation.cbegin(), permutation.size(), reference_inverse_permutation.begin());
benchmark::DoNotOptimize(permutation.data());
benchmark::ClobberMemory();
state.ResumeTiming();
inverse_permutation_shortcut_n(permutation.begin(), permutation.size());
benchmark::DoNotOptimize(permutation.data());
benchmark::ClobberMemory();
// state.PauseTiming(); // to check that the solution is correct
// if(reference_inverse_permutation != permutation) throw std::runtime_error{""};
// state.ResumeTiming();
}
state.SetItemsProcessed(state.iterations()*permutation.size() );
state.SetComplexityN(state.range(0));
}
BENCHMARK(BM_inverse_permutation_shortcut)->RangeMultiplier(2)->Range(8, 8<<10)->Complexity(benchmark::oNLogN);
BENCHMARK_MAIN();
$ c++ a.cpp -O3 -DNDEBUG -lbenchmark && ./a.out
2021-03-30 21:16:55
Running ./a.out
Run on (12 X 4600 MHz CPU s)
CPU Caches:
L1 Data 32K (x6)
L1 Instruction 32K (x6)
L2 Unified 256K (x6)
L3 Unified 12288K (x1)
Load Average: 1.26, 1.80, 1.76
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-----------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
-----------------------------------------------------------------------------------------------
BM_inverse_permutation/8 476 ns 478 ns 1352259 items_per_second=16.7276M/s
BM_inverse_permutation/16 614 ns 616 ns 1124905 items_per_second=25.9688M/s
BM_inverse_permutation/32 1106 ns 1107 ns 630398 items_per_second=28.9115M/s
BM_inverse_permutation/64 2929 ns 2931 ns 238236 items_per_second=21.835M/s
BM_inverse_permutation/128 10748 ns 10758 ns 64708 items_per_second=11.898M/s
BM_inverse_permutation/256 41556 ns 41582 ns 16600 items_per_second=6.15656M/s
BM_inverse_permutation/512 164006 ns 164023 ns 4245 items_per_second=3.12151M/s
BM_inverse_permutation/1024 621341 ns 620840 ns 1056 items_per_second=1.64938M/s
BM_inverse_permutation/2048 2468060 ns 2466060 ns 293 items_per_second=830.474k/s
BM_inverse_permutation/4096 10248540 ns 10244982 ns 93 items_per_second=399.805k/s
BM_inverse_permutation/8192 55926122 ns 55926230 ns 10 items_per_second=146.479k/s
BM_inverse_permutation_BigO 0.82 N^2 0.82 N^2
BM_inverse_permutation_RMS 18 % 18 %
BM_inverse_permutation_shortcut/8 499 ns 501 ns 1193871 items_per_second=15.9827M/s
BM_inverse_permutation_shortcut/16 565 ns 567 ns 1225056 items_per_second=28.2403M/s
BM_inverse_permutation_shortcut/32 740 ns 742 ns 937909 items_per_second=43.1509M/s
BM_inverse_permutation_shortcut/64 1121 ns 1121 ns 619016 items_per_second=57.0729M/s
BM_inverse_permutation_shortcut/128 1976 ns 1977 ns 355982 items_per_second=64.745M/s
BM_inverse_permutation_shortcut/256 3644 ns 3645 ns 191387 items_per_second=70.2375M/s
BM_inverse_permutation_shortcut/512 7282 ns 7288 ns 95434 items_per_second=70.2481M/s
BM_inverse_permutation_shortcut/1024 14732 ns 14752 ns 47417 items_per_second=69.4165M/s
BM_inverse_permutation_shortcut/2048 30590 ns 30398 ns 23079 items_per_second=67.3728M/s
BM_inverse_permutation_shortcut/4096 64374 ns 64039 ns 10766 items_per_second=63.9613M/s
BM_inverse_permutation_shortcut/8192 196961 ns 195786 ns 3646 items_per_second=41.8416M/s
BM_inverse_permutation_shortcut_BigO 1.74 NlgN 1.73 NlgN
BM_inverse_permutation_shortcut_RMS 27 % 27 %