4

I have two very long vectors:

a <- sample(1e+08L, size = 1e+09L, replace = TRUE)
b <- sample(1e+08L, size = 1e+09L, replace = TRUE)

I want to generate an integer vector r of length length(a) such that r[i] is the index of a[i] in b.

I tried pmatch(a, b) but it is very slow. Is there a more efficient way?


Desired output for a small example:

a <- c(1, 3, 5, 7, 8)
b <- c(3, 1, 7, 8, 5)
f(a, b)
## [1] 2 1 5 3 4
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81
user18373817
  • 161
  • 7
  • 2
    Is there a perfect matching between a and b? That is every element in a is also present in b? Or can some elements have no matches? Can they have multiple matches? What then? – user2974951 Mar 16 '23 at 09:09
  • @user2974951 all elements of b are in a but not vice versa – user18373817 Mar 16 '23 at 09:13
  • 1
    Why `pmatch` here? `match` seems to be working the same – Maël Mar 16 '23 at 09:42
  • We must be clear here: `pmatch` is for _partial string matching_, `match` is for exact integer matching. Consider that `pmatch(1L, 100L)` returns `1L` simply because the first character in `"100"` is `"1"`. All of the answers here should be edited to exclude `pmatch` to avoid spreading this misconception about its purpose. – Mikael Jagan Mar 17 '23 at 13:53
  • See also: [Is there an R function for finding the index of an element in a vector?](https://stackoverflow.com/q/5577727/10488504) – GKi Mar 20 '23 at 12:44

7 Answers7

4

Your question mentions pmatch, which performs partial matching of character vectors, but it seems like you want match, which performs exact matching of integer and other vectors.

match is faster, but even faster than match is fastmatch::fmatch:

match(b, a)
fastmatch::fmatch(b, a)

Adding to the benchmarks:

library(fastmatch)

set.seed(1)
a <- sample(1e5, 1e5)
b <- sample(1e5, 1e4)

microbenchmark::microbenchmark(
  match = match(b, a),
  fastmatch = fmatch(b, a),
  check = "identical", 
  times = 100L)
Unit: microseconds
      expr      min       lq      mean    median        uq       max neval
     match 9439.020 9500.602 9659.7537 9519.6260 9555.0090 12394.546   100
 fastmatch  367.606  376.134  398.4347  382.0175  399.1145   614.467   100
Mikael Jagan
  • 9,012
  • 2
  • 17
  • 48
Maël
  • 45,206
  • 3
  • 29
  • 67
3

Benchmark:

library(fastmatch)   #fmatch
library(data.table)  #merge
library(collections) #hash

Rcpp::cppFunction('IntegerVector matchC(NumericVector x, NumericVector table) {
  IntegerVector out(x.size(), NA_INTEGER);
  for(int i = 0; i < x.size(); i++) {
    for(int j = 0; j < table.size(); j++) {
      if(x[i] == table[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')

Rcpp::cppFunction('IntegerVector matchC2(const std::vector<int>& x, const std::vector<int>& table) {
  IntegerVector out(x.size(), NA_INTEGER);
  std::unordered_map<int, int> lut;
  lut.max_load_factor(table.size()/(double)*max_element(table.begin(), table.end()));
  lut.reserve(table.size());
  for(int i = 0; i < table.size(); i++) lut[table[i]] = i+1;
  for(int i = 0; i < x.size(); i++) {
    auto search = lut.find(x[i]);
    if(search != lut.end()) out[i] = search->second;
  }
  return out;
}')

set.seed(1); a <- sample(1e5, 1e5); b <- sample(1e5, 1e4)
    
bench::mark(
  match = { match(a, b) },
  fmatch = { fmatch(a, b) },
  zx8754.merge = {
    merge(data.table(x = a, rnA = seq_along(a), key = "x"),
          data.table(x = b, rnB = seq_along(b), key = "x"),
          all.x = TRUE)[order(rnA), rnB] },
  sotos.Rcpp = { matchC(a, b) },
  GKi.Rcpp = { matchC2(a, b) },
  user2974951.hash = {
    h = dict(seq_along(b), b)
    sapply(a, h$get, default = NA)},
  "jblood94.[" = `[<-`(NA_integer_, b, seq_along(b))[a]
)

Result

  expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 match              1.86ms   1.99ms   479.      951.3KB     7.98   240     4
2 fmatch             1.03ms   1.12ms   896.     393.34KB     6.00   448     3
3 zx8754.merge       4.71ms   5.02ms   181.       8.05MB    31.6     92    16
4 sotos.Rcpp          2.38s    2.38s     0.420    1.22MB     0        1     0
5 GKi.Rcpp         891.93µs  945.6µs  1018.     393.16KB     7.98   510     4
6 user2974951.hash 127.58ms 133.66ms     6.85     3.44MB    30.8      4    18
7 jblood94.[       227.28µs  244.8µs  3193.     800.85KB    48.0   1597    24
GKi
  • 37,245
  • 2
  • 26
  • 48
zx8754
  • 52,746
  • 12
  • 114
  • 209
2

If the objective is just to match(), then we can use:

cppFunction('IntegerVector matchC(NumericVector x, NumericVector table) {
  IntegerVector out(x.size(), NA_INTEGER);
  for(int i = 0; i < x.size(); i++) {
    for(int j = 0; j < table.size(); j++) {
      if(x[i] == table[j]) {
        out[i] = j + 1;
        break;
      }
    }
  }
  return out;
}')
Sotos
  • 51,121
  • 6
  • 32
  • 66
2

Alternatively you can create a hash table (which doesn't exist in base R for some reason) then you can look up the indices of the elements in b for every element in a in O(1) time.

> library(collections)
> h=dict(seq_along(b),b)
> sapply(a,h$get,default=NA)
[1] 2 1 5 3 4
user2974951
  • 9,535
  • 1
  • 17
  • 24
2

Convert to data.table and merge:

library(data.table)

merge(data.table(x = a, rnA = seq_along(a), key = "x"),
      data.table(x = b, rnB = seq_along(b), key = "x"),
      all.x = TRUE)[order(rnA), rnB]
# [1] 2 1 5 3 4
zx8754
  • 52,746
  • 12
  • 114
  • 209
1

Updated based on GKi's comment.

If a and b are both positive integer vectors, with length(b)~max(b), indexing will be even faster:

library(fastmatch)   #fmatch

intmatch1 <- function(a, b, maxb = max(b)) {
  B <- rep(NA_integer_, maxb)
  B[b] <- seq_along(b)
  B[a]
}

intmatch2 <- function(a, b) {
  `[<-`(NA_integer_, b, seq_along(b))[a]
}

set.seed(1); a <- sample(1e5); b <- sample(1e5, 1e4)

microbenchmark::microbenchmark(
  match = match(a, b),
  fmatch = fmatch(a, b),
  intmatch1.1 = intmatch1(a, b),
  intmatch1.2 = intmatch1(a, b, 1e5),
  intmatch2 = intmatch2(a, b),
  check = "identical")
#> Unit: microseconds
#>         expr      min       lq      mean    median       uq      max neval
#>        match 2543.101 2593.901 2800.5480 2646.7020 2732.801 7627.601   100
#>       fmatch 1107.801 1181.551 1272.0101 1201.4510 1261.601 5928.401   100
#>  intmatch1.1  316.401  375.451  418.3490  402.3005  424.351  658.001   100
#>  intmatch1.2  327.000  386.851  522.8529  401.7505  456.901 5670.001   100
#>    intmatch2  277.402  347.051  424.7270  359.6010  382.902 5026.601   100
jblood94
  • 10,340
  • 1
  • 10
  • 15
  • A shorter version could be `\`[<-\`(NA, b, seq_along(b))[a]` – GKi Mar 20 '23 at 12:09
  • 1
    Currently this will only work for integers > 0. So `min` of `a` and `b` might need to be considered. – GKi Mar 21 '23 at 10:27
1

Another Rcpp variant based on @Sotos but using unordered_map to find the index.

Rcpp::cppFunction('IntegerVector matchC2(const std::vector<int>& x, const std::vector<int>& table) {
  IntegerVector out(x.size(), NA_INTEGER);
  std::unordered_map<int, int> lut;
  lut.max_load_factor(table.size()/(double)*max_element(table.begin(), table.end()));
  lut.reserve(table.size());
  for(int i = 0; i < table.size(); i++) lut[table[i]] = i+1;
  for(int i = 0; i < x.size(); i++) {
    auto search = lut.find(x[i]);
    if(search != lut.end()) out[i] = search->second;
  }
  return out;
}')

matchC2(a, b)
#[1] 2 1 5 3 4
GKi
  • 37,245
  • 2
  • 26
  • 48