
I would like to create a function using Rcpp that outperforms base R's pmax. I also tried to handle missing values inside the Rcpp function, which may not have been a good idea. Every vector will have some missing values, and all values are positive. That is why I recoded missing values to -1: it lets me put NA back afterwards at positions where no maximum exists because all values there are missing.

This was my first attempt, but with no success so far:

library("benchr")
library("Rcpp")

Pmax <- function(...) {
  args_list <- list(...)
  cppFunction("
  NumericVector cpp_pmax(List args) {
    List args0 = args[0];
    int n_arg = args.length();
    int n_vec = args0.length();
    NumericVector out(n_vec);
    out = args[0];
    for (int i = 1; i < n_arg; ++i) {
        NumericVector pa(n_vec);
        pa = args[i];
        for (int j = 0; j < n_vec; ++j) {
          if (R_IsNA(out[j])) {
            out[j] = -1;
          }
          if (R_IsNA(pa[j])) {
            pa[j] = -1;
          }
          out[j] = std::max(out[j], pa[j]);
        }
    }
    for (int j = 0; j < n_vec; ++j) {
      if (out[j] == -1) {
        out[j] = NA_REAL;
      }
    }
    return out;
  }
")
  output <- cpp_pmax(args_list)
  return(output)
}


n <- 200000
x1 <- sample(0:1, n, replace = TRUE)
y1 <- sample(0:1, n, replace = TRUE)
z1 <- sample(0:1, n, replace = TRUE)
x1[sample(1:n, 90)] <- NA
y1[sample(1:n, 60)] <- NA
z1[sample(1:n, 70)] <- NA

pm1 <- Pmax(x1, y1, z1)
pm2 <- pmax(x1, y1, z1, na.rm = TRUE)

all(pm1 == pm2, na.rm = TRUE)

benchr::benchmark(pmax(x1, y1, z1, na.rm = TRUE),
                  Pmax(x1, y1, z1))

Benchmark summary:
  Time units : milliseconds 
expr                           n.eval   min lw.qu median  mean up.qu   max total relative
pmax(x1, y1, z1, na.rm = TRUE)    100  1.34  1.37   1.39  1.44  1.46  1.74   144     1.00
Pmax(x1, y1, z1)                  100 13.30 13.50  13.80 19.90 15.70 67.50  1990     9.88

Edit:

I've removed some of the loops and replaced the -1 with NA outside of the Rcpp function, which sped it up a little, but it still does not outperform base R's pmax.

Although Rcpp::pmax is a nice implementation, it only handles two vectors, and I'm not sure it handles missing values; I got different results when missing values are present.
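
For illustration, here is a minimal sketch of the difference I mean (pm2 is just a hypothetical two-argument wrapper around Rcpp::pmax, not part of my code above):

cppFunction("NumericVector pm2(NumericVector x, NumericVector y) { return pmax(x, y); }")
pm2(c(1, NA, 3), c(2, 2, NA))                   # NAs propagate into the result
pmax(c(1, NA, 3), c(2, 2, NA), na.rm = TRUE)    # 2 2 3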

The second attempt was:

Pmax1 <- function(...) {
  args_list <- list(...)
  cppFunction("
  NumericVector cpp_pmax(List args) {
    List args0 = args[0];
    int n_arg = args.length();
    int n_vec = args0.length();
    NumericVector out(n_vec);
    out = args[0];
    for (int i = 1; i < n_arg; ++i) {
        NumericVector pa(n_vec);
        pa = args[i];
        for (int j = 0; j < n_vec; ++j) {
          if (R_IsNA(out[j])) {
            out[j] = -1;
          }
          if (R_IsNA(pa[j])) {
            pa[j] = -1;
          }
          out[j] = std::max(out[j], pa[j]);
        }
    }
    return out;
  }
")
  output <- cpp_pmax(args_list)
  output[output == -1] <- NA
  return(output)
}

Pmax2 <- function(...) {
  args_list <- list(...)
  cppFunction("
  NumericVector cpp_pmax(List args) {
    NumericVector out = args[0];
    int n_arg = args.length();
    int n_vec = out.length();
    for (int j = 0; j < n_vec; ++j) {
      if (NumericVector::is_na(out[j])) out[j] = -1;
    }
    for (int i = 1; i < n_arg; ++i) {
      NumericVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (NumericVector::is_na(pa[j])) pa[j] = -1;
        out[j] = std::max(out[j], pa[j]);
      }
    }
    return out;
  }
")
  output <- cpp_pmax(args_list)
  output[output == -1] <- NA
  return(output)
}

n <- 200000
x <- sample(0:5, n, replace = TRUE)
y <- sample(0:5, n, replace = TRUE)
z <- sample(0:5, n, replace = TRUE)
w <- sample(0:5, n, replace = TRUE)
x[sample(1:n, 900)] <- NA
y[sample(1:n, 600)] <- NA
z[sample(1:n, 700)] <- NA
w[sample(1:n, 800)] <- NA

benchr::benchmark(pmax(x,  y, z, w, na.rm = TRUE),
                  Pmax1(x,  y, z, w),
                  Pmax2(x, y, z, w))

Benchmark summary:
  Time units : milliseconds 
                          expr n.eval   min lw.qu median  mean up.qu  max total relative
pmax(x, y, z, w, na.rm = TRUE)    100  2.38  2.43   2.46  2.46  2.48  2.6   246     1.00
Pmax1(x, y, z, w)                 100 16.00 16.90  17.20 19.40 17.70 61.2  1940     6.98
Pmax2(x, y, z, w)                 100  9.44  9.74   9.90 11.30 10.10 45.6  1130     4.02

Does anyone have an idea how to make it faster than base R's pmax?

The idea was to have a generalized function that handles a varying number of vectors, all inside the Rcpp function.

Update based on @DirkEddelbuettel's and @Cole's answers

Thank you for helping to optimize the code. Inspired by @DirkEddelbuettel's and @Cole's answers, I added Rcpp::pmax to remove one of the loops, which also helped speed it up.

library("bench")
library("Rcpp")

cppFunction("
  IntegerVector cpp_pmax1(List args) {
    IntegerVector tmp = args[0];
    IntegerVector out = clone(tmp);
    int n_arg = args.length();
    int n_vec = out.length();
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (pa[j] > out[j]) out[j] = pa[j];
      }
    }
    return out;
  }
")

cppFunction("
  IntegerVector cpp_pmax2(List args) {
    IntegerVector tmp = args[0];
    IntegerVector out = clone(tmp);
    int n_arg = args.length();
    int n_vec = out.length();
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      out = pmax(out, pa);
    }
    return out;
  }
")

Pmax1 <- function(...) {
  cpp_pmax1(list(...))
}


Pmax2 <- function(...) {
  cpp_pmax2(list(...))
}


n <- 200000
x <- sample(0:5, n, replace = TRUE)
y <- sample(0:5, n, replace = TRUE)
z <- sample(0:5, n, replace = TRUE)
w <- sample(0:5, n, replace = TRUE)
k <- sample(0:5, n, replace = TRUE)
x[sample(1:n, 900)] <- NA
y[sample(1:n, 600)] <- NA
z[sample(1:n, 700)] <- NA
w[sample(1:n, 800)] <- NA
k[sample(1:n, 800)] <- NA

pm0 <- pmax(x,  y, z, w, k, na.rm = TRUE)
pm1 <- Pmax1(x, y, z, w, k)
pm2 <- Pmax2(x, y, z, w, k)

benchr::benchmark(pmax(x,  y, z, w, k, na.rm = TRUE),
                  Pmax1(x, y, z, w, k),
                  Pmax2(x, y, z, w, k))


Benchmark summary:
  Time units : microseconds 
                             expr n.eval  min lw.qu median mean up.qu  max  total relative
pmax(x, y, z, w, k, na.rm = TRUE)    100 2880  2900   2920 3050  3080 8870 305000     5.10
Pmax1(x, y, z, w, k)                 100 2150  2180   2200 2310  2350 8060 231000     3.85
Pmax2(x, y, z, w, k)                 100  527   558    572  812   719 7870  81200     1.00
  

Thank you!

  • 1
    Isn't the main problem that you are compiling an Rcpp function in every call to your `Pmax` functions? When I compile the function separately, your version is about 50% slower than base `pmax`. It's not entirely realistic to assume that every function we make will outperform base R - there's nothing magic with Rcpp that the wizards of R core are unable to do in C, especially when we talk about .Internal functions. – Cole Feb 03 '21 at 02:44
  • Right on. And we say so in the docs. Hard to (solidly) beat functions that are already vectorised, other than by skipping data checks etc. – Dirk Eddelbuettel Feb 03 '21 at 03:50

3 Answers

4

As an aside, note that Rcpp sugar already has Rcpp::pmax():

> library(Rcpp)
> cppFunction("NumericVector pm(NumericVector x, NumericVector y) { 
+              return pmax(x,y);}")
> pm(10.0*(1:10), rep(50, 10))
 [1]  50  50  50  50  50  60  70  80  90 100
> pm(10.0*(1:10), c(rep(50, 8), NA, 50))
 [1]  50  50  50  50  50  60  70  80  NA 100
> 

There may well be scope for another, more general, function but hopefully this can help you too as a benchmark.

Edit: In my first version I accidentally called pmax() when I meant to call pm() (using Rcpp::pmax()). Results are the same.

pm() and pmax() are of about the same order of speed, as one would expect since both are vectorised:

> library(microbenchmark)
> set.seed(123)
> x <- cumsum(rnorm(1e6))
> y <- cumsum(rnorm(1e6))
> microbenchmark(pmax(x,y), pm(x,y))
Unit: milliseconds
       expr     min      lq    mean  median      uq      max neval cld
 pmax(x, y) 3.94342 4.07488 4.66378 4.15433 5.39961  7.81931   100   a
   pm(x, y) 3.58781 3.68886 4.74249 3.75815 5.38444 22.31268   100   a
> 
Dirk Eddelbuettel
  • I tried to use it, but Rcpp::pmax does not handle NA: `cppFunction("NumericVector pm(NumericVector x, NumericVector y) {return pmax(x,y);}"); n <- 20; x1 <- sample(0:5, n, replace = TRUE); y1 <- sample(0:5, n, replace = TRUE); x1[sample(1:n, 9)] <- NA; y1[sample(1:n, 6)] <- NA; pm(x1, y1); pmax(x1, y1, na.rm = TRUE)` – Fernando Brito Lopes Feb 02 '21 at 20:08
3

I guess you can try fcoalesce + fifelse (both from the data.table package) to define your Pmax function like below:

library(data.table)  # provides fcoalesce() and fifelse()

Pmax <- function(..., na.rm = FALSE) {
  u <- list(...)
  if (na.rm) {
    return(
      Reduce(function(x, y) {
        x <- fcoalesce(x, y)
        y <- fcoalesce(y, x)
        fifelse(x <= y, y, x)
      }, u)
    )
  }
  Reduce(function(x, y) fifelse(x <= y, y, x), u)
}
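
To see why the two fcoalesce calls implement na.rm = TRUE, here is a minimal illustration on toy data (the names a and b are used only for this example):

library(data.table)
x <- c(1, NA,  3, NA)
y <- c(2,  5, NA, NA)
a <- fcoalesce(x, y)      # x with its NAs filled from y:         1 5 3 NA
b <- fcoalesce(y, a)      # y with its NAs filled from patched x: 2 5 3 NA
fifelse(a <= b, b, a)     # pairwise max, NA only where both NA:  2 5 3 NA
pmax(x, y, na.rm = TRUE)  # same result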

Benchmark (Testing with the data in OP's post)

  • If you enable na.rm = TRUE, Pmax is somewhat slower (roughly 2x on the median) than base R pmax
> microbenchmark::microbenchmark(
+   pmax(x1, y1, z1, na.rm = TRUE),
+   Pmax(x1, y1, z1, na.rm = TRUE),
+   check = "equivalent",
+   unit = "relati ..." ... [TRUNCATED]
Unit: relative
                           expr      min      lq     mean   median       uq      max neval
 pmax(x1, y1, z1, na.rm = TRUE) 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000   100
 Pmax(x1, y1, z1, na.rm = TRUE) 1.428545 1.87539 1.974959 2.022579 2.094833 1.387139   100
  • If you use the default na.rm = FALSE, you will find Pmax is slightly faster than base R pmax
> microbenchmark::microbenchmark(
+   pmax(x1, y1, z1),
+   Pmax(x1, y1, z1),
+   check = "equivalent",
+   unit = "relative"
+ )
Unit: relative
             expr      min      lq     mean   median       uq      max neval
 pmax(x1, y1, z1) 1.387953 1.32482 1.053983 1.220124 1.143867 0.266205   100
 Pmax(x1, y1, z1) 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000   100
ThomasIsCoding
3

There seem to be a few issues, which the memory allocations reported by bench::mark help uncover.

bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
            Pmax2(x, y, z, w))

## # A tibble: 2 x 13
##   expression                         min  median `itr/sec` mem_alloc
##   <bch:expr>                     <bch:t> <bch:t>     <dbl> <bch:byt>
## 1 pmax(x, y, z, w, na.rm = TRUE)  5.79ms  6.28ms     157.    781.3KB
## 2 Pmax2(x, y, z, w)              39.56ms 54.48ms      19.7    9.18MB

Memory Coercion

There is about 10 times the memory allocation compared to base pmax(). Your code is relatively straightforward, so this hints at some kind of coercion. Looking at your sample data, you are sending integer vectors to a numeric signature, which triggers a costly coercion. Let's update the signature and code to expect IntegerVector inputs; I simply changed everything from NumericVector to IntegerVector.
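
The Pmax2_int code is not shown in the original answer, so the following is an assumed reconstruction: Pmax2 with every NumericVector swapped for IntegerVector (and cppFunction still called inside the wrapper, which the next section addresses).

Pmax2_int <- function(...) {
  args_list <- list(...)
  cppFunction("
  IntegerVector cpp_pmax(List args) {
    IntegerVector out = args[0];
    int n_arg = args.length();
    int n_vec = out.length();
    for (int j = 0; j < n_vec; ++j) {
      if (IntegerVector::is_na(out[j])) out[j] = -1;
    }
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (IntegerVector::is_na(pa[j])) pa[j] = -1;
        out[j] = std::max(out[j], pa[j]);
      }
    }
    return out;
  }
")
  output <- cpp_pmax(args_list)
  output[output == -1] <- NA
  return(output)
}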

  expression                         min  median `itr/sec` mem_alloc
  <bch:expr>                     <bch:t> <bch:t>     <dbl> <bch:byt>
1 pmax(x, y, z, w, na.rm = TRUE)  1.89ms  2.33ms     438.    781.3KB
2 Pmax2_int(x, y, z, w)          37.42ms 49.88ms      17.6    2.32MB

Re-Compilation

The OP's code calls cppFunction inside the wrapper function, so the C++ code is recompiled on every call. Unless we really need to recompile each time, we can compile once and then call the compiled function from R. This is the biggest performance boost at this dataset size.

cppFunction("
  IntegerVector cpp_pmax_pre(List args) {
    IntegerVector out = args[0];
    int n_arg = args.length();
    int n_vec = out.length();
    for (int j = 0; j < n_vec; ++j) {
      if (IntegerVector::is_na(out[j])) out[j] = -1;
    }
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (IntegerVector::is_na(pa[j])) pa[j] = -1;
        out[j] = std::max(out[j], pa[j]);
      }
    }
    return out;
  }
")

Pmax2_int_pre <- function(...) {
  args_list <- list(...)
  output <- cpp_pmax_pre(args_list)
  output[output == -1] <- NA
  return(output)
}

bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
            Pmax2_int_pre(x, y, z, w))

## # A tibble: 2 x 13
##   expression                        min median `itr/sec` mem_alloc
##   <bch:expr>                     <bch:> <bch:>     <dbl> <bch:byt>
## 1 pmax(x, y, z, w, na.rm = TRUE) 2.31ms 2.42ms      397.   781.3KB
## 2 Pmax2_int_pre(x, y, z, w)      2.48ms 3.55ms      270.    2.29MB

More memory and small optimizations

Finally, we still have more memory allocated than base pmax, which hints we can do more - in this case we should write the NA values back (as NA_INTEGER) inside the Rcpp function instead of allocating a new vector in R. Relatedly, we can also optimize the loop assignment a little.

cppFunction("
  IntegerVector cpp_pmax_final(List args) {
    IntegerVector out = args[0];
    int n_arg = args.length();
    int n_vec = out.length();
    for (int j = 0; j < n_vec; ++j) {
      if (IntegerVector::is_na(out[j])) out[j] = -1;
    }
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        // simpler logic: if the element is not NA and is greater than out[j], update out[j]
        if (!IntegerVector::is_na(pa[j]) && pa[j] > out[j]) out[j] = pa[j];
      }
    }
    // replace the -1 sentinel with NA here in Rcpp instead of allocating a new vector in R
    for (int i = 0; i < n_vec; i++) {
      if(out[i] == -1) out[i] = NA_INTEGER;
    }
    return out;
  }
")

Pmax2_final <- function(...) {
  cpp_pmax_final(list(...))
}

bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
            Pmax2_final(x, y, z, w))

## # A tibble: 2 x 13
##   expression                        min median `itr/sec` mem_alloc
##   <bch:expr>                     <bch:> <bch:>     <dbl> <bch:byt>
## 1 pmax(x, y, z, w, na.rm = TRUE)    2ms 2.08ms      460.   781.3KB
## 2 Pmax2_final(x, y, z, w)        1.19ms 1.45ms      671.    2.49KB

We did it*! I am sure there are still small optimizations left - we access pa[j] three times, so it may be worthwhile to assign it to a local variable.

Bonus - NA_INTEGER

According to Rcpp for Everyone, NA_INTEGER is stored as the lowest representable integer, -2147483648. Because every non-NA value compares greater than that, the plain comparison pa[j] > out[j] already handles NAs for int data types, and we can drop the NA replacement entirely.

While working on this, I also found an issue with the previous versions - we need to clone the first argument so that we do not accidentally modify it by reference. Even so, we're still slightly faster than base pmax().

cppFunction("
  IntegerVector cpp_pmax_last(List args) {
    IntegerVector tmp = args[0];
    IntegerVector out = clone(tmp);
    int n_arg = args.length();
    int n_vec = out.length();
    for (int i = 1; i < n_arg; ++i) {
      IntegerVector pa = args[i];
      for (int j = 0; j < n_vec; ++j) {
        if (pa[j] > out[j]) out[j] = pa[j];
      }
    }
    return out;
  }
")

Pmax2_last <- function(...) {
  cpp_pmax_last(list(...))
}

bench::mark(pmax(x,  y, z, w, na.rm = TRUE),
            Pmax2_last(x, y, z, w),
)

## # A tibble: 2 x 13
##   expression                        min median `itr/sec` mem_alloc `gc/sec`
##   <bch:expr>                     <bch:> <bch:>     <dbl> <bch:byt>    <dbl>
## 1 pmax(x, y, z, w, na.rm = TRUE) 5.98ms 6.36ms      154.     781KB        0
## 2 Pmax2_last(x, y, z, w)         5.09ms 5.46ms      177.     784KB        0
Cole
  • 1
    Nice work. At the end it gets us ... essentially the speed of the existing function. Ah well. :) – Dirk Eddelbuettel Feb 03 '21 at 03:49
  • 1
    Amazing! Liked your answer, +1! May I ask if this can be generalized to `float` values for the row-wise maximum? – ThomasIsCoding Feb 03 '21 at 09:09
  • 1
    @ThomasIsCoding, I'm fairly certain this can easily be generalized to most `SEXP` types with some templates and possibly use of `RCPP_RETURN_VECTOR`. – Joseph Wood Feb 03 '21 at 13:17
  • Check out this post https://stackoverflow.com/a/19829440/4408538 and this excellent gallery article https://gallery.rcpp.org/articles/rcpp-return-macros/ – Joseph Wood Feb 03 '21 at 13:19
  • @JosephWood Thanks a lot! That's amazing and helpful! – ThomasIsCoding Feb 03 '21 at 13:21
  • That is great, thanks a lot for the very clear explanation. I think I will add a conditional statement to deal with real and integer vectors and dispatch to the appropriate Rcpp function. – Fernando Brito Lopes Feb 03 '21 at 13:49
  • There is a small amount of overhead by casting `args[i]` to `IntegerVector` in every loop. Instead, do that once by putting the vectors into a `std::vector`. This saves an additional 10-15% time. – thc Feb 04 '21 at 17:42
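
A sketch of one way to read that last suggestion (this is an assumption, not code from the thread): cast each element of the List to IntegerVector exactly once, caching the handles in a std::vector, before looping.

cppFunction("
  IntegerVector cpp_pmax_cached(List args) {
    int n_arg = args.length();
    // cast every list element to IntegerVector once, up front
    std::vector<IntegerVector> vecs;
    vecs.reserve(n_arg);
    for (int i = 0; i < n_arg; ++i) vecs.push_back(args[i]);
    IntegerVector out = clone(vecs[0]);
    int n_vec = out.length();
    for (int i = 1; i < n_arg; ++i) {
      const IntegerVector& pa = vecs[i];
      for (int j = 0; j < n_vec; ++j) {
        if (pa[j] > out[j]) out[j] = pa[j];  // NA_INTEGER is INT_MIN, so NAs never win
      }
    }
    return out;
  }
")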