You can use sort
with the partial
argument. Here is an example:
# simulate data
set.seed(2)
X <- matrix(rnorm(10000), 100)
# use sort with partial
x1 <- -sort(-X, partial = 5)
# gives the same as sorting the whole thing
x2 <- tail(sort(X), 5)
setdiff(x1[1:5], x2)
#R> numeric(0)
It is a bit faster for the example above:
bench::mark(
`sort with partial` = -sort(-X, partial = 5),
`tail + sort` = tail(sort(X), 5),
min_time = 1, max_iterations = 1e6, check = FALSE)
#R> # A tibble: 2 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result
#R> <bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list>
#R> 1 sort with partial 171µs 183µs 5281. 274KB 38.0 4592 33 870ms <NULL>
#R> 2 tail + sort 505µs 542µs 1814. 117KB 6.16 1768 6 974ms <NULL>
#R> # … with 3 more variables: memory <list>, time <list>, gc <list>
I am sure though that there is sorting method outside base R which has a partial sorting method with decreasing order as well. This should be faster because of the avoided copy.
Update
If this ever is a bottleneck for anyone then a Rcpp solution is:
#include "Rcpp.h"
#include <algorithm>
using namespace Rcpp;
inline bool rev_comp(double const i, double const j){
return i > j;
}
// [[Rcpp::export(rng = false)]]
NumericVector get_k_max(NumericVector x, unsigned const k) {
if(k >= x.size() or k < 1)
throw std::invalid_argument("Invalid k");
if(k + 1 == x.size())
return x;
NumericVector out = clone(x);
std::partial_sort(&out[0], &out[k + 1], &out[x.size() - 1], rev_comp);
return out;
}
Using Rcpp::sourceCpp
on a file with the above yields a fast solution:
# we get the same
x3 <- get_k_max(X, 5)
setdiff(x3[1:5], x2)
#R> numeric(0)
# it is faster
bench::mark(
Rcpp = get_k_max(X, 5),
min_time = 1, max_iterations = 1e6, check = FALSE)
#R> # A tibble: 1 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result
#R> <bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list>
#R> 1 Rcpp 19.2µs 22.9µs 42102. 78.2KB 91.9 30226 66 718ms <NULL>
#R> # … with 3 more variables: memory <list>, time <list>, gc <list>