Benchmark:
library(fastmatch) #fmatch
library(data.table) #merge
library(collections) #hash
Rcpp::cppFunction('IntegerVector matchC(NumericVector x, NumericVector table) {
IntegerVector out(x.size(), NA_INTEGER);
for(int i = 0; i < x.size(); i++) {
for(int j = 0; j < table.size(); j++) {
if(x[i] == table[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
Rcpp::cppFunction('IntegerVector matchC2(const std::vector<int>& x, const std::vector<int>& table) {
IntegerVector out(x.size(), NA_INTEGER);
std::unordered_map<int, int> lut;
lut.max_load_factor(table.size()/(double)*max_element(table.begin(), table.end()));
lut.reserve(table.size());
for(int i = 0; i < table.size(); i++) lut[table[i]] = i+1;
for(int i = 0; i < x.size(); i++) {
auto search = lut.find(x[i]);
if(search != lut.end()) out[i] = search->second;
}
return out;
}')
set.seed(1); a <- sample(1e5, 1e5); b <- sample(1e5, 1e4)
bench::mark(
match = { match(a, b) },
fmatch = { fmatch(a, b) },
zx8754.merge = {
merge(data.table(x = a, rnA = seq_along(a), key = "x"),
data.table(x = b, rnB = seq_along(b), key = "x"),
all.x = TRUE)[order(rnA), rnB] },
sotos.Rcpp = { matchC(a, b) },
GKi.Rcpp = { matchC2(a, b) },
user2974951.hash = {
h = dict(seq_along(b), b)
sapply(a, h$get, default = NA)},
"jblood94.[" = `[<-`(NA_integer_, b, seq_along(b))[a]
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 match 1.86ms 1.99ms 479. 951.3KB 7.98 240 4
2 fmatch 1.03ms 1.12ms 896. 393.34KB 6.00 448 3
3 zx8754.merge 4.71ms 5.02ms 181. 8.05MB 31.6 92 16
4 sotos.Rcpp 2.38s 2.38s 0.420 1.22MB 0 1 0
5 GKi.Rcpp 891.93µs 945.6µs 1018. 393.16KB 7.98 510 4
6 user2974951.hash 127.58ms 133.66ms 6.85 3.44MB 30.8 4 18
7 jblood94.[ 227.28µs 244.8µs 3193. 800.85KB 48.0 1597 24