7

I'm working with origin-destination (OD) data that contains an origin ID and a destination ID in separate columns. Sometimes it's important to aggregate OD pairs that are the same, except that the origin and destination is swapped.

OD data looks like this:

orign             dest      value
E02002361         E02002361 109
E02002361         E02002363  38
E02002361         E02002367  10
E02002361         E02002371  44
E02002363         E02002361  34

In the above example, the first and the last rows can be considered as the same pair, except in the opposite direction. The challenge is how to efficiently identify that they are duplicates.

I created a package, stplanr, that can answer this question, as illustrated in the reproducible example below:

x = read.csv(stringsAsFactors = FALSE, text = "orign,dest,value
E02002361,E02002361,109
E02002361,E02002363,38
E02002361,E02002367,10
E02002361,E02002371,44
E02002363,E02002361,34")
duplicated(stplanr::od_id_order(x)[[3]])
#> Registered S3 method overwritten by 'R.oo':
#>   method        from       
#>   throw.default R.methodsS3
#> [1] FALSE FALSE FALSE FALSE  TRUE

Created on 2019-07-27 by the reprex package (v0.3.0)

The problem with this approach is that it's slow for large datasets.

I've looked at fastest way to get Min from every column in a matrix? which suggests that pmin is the most efficient way to get the minimum value across multiple columns (not 2), and we're already using this.

Unlike Removing duplicate combinations (irrespective of order) this question is about duplicate identification on only 2 columns and efficiency. The solution posted in Removing duplicate combinations (irrespective of order) seems to be slower than the slowest solution shown in the timings below.

A solution that is faster than this is the szudzik_pairing function, created by my colleague Malcolm Morgan and based on a method developed by Matthew Szudzik.

We've tried each approach and the Szudzik approach does indeed seem faster but I'm wondering: is there a more efficient way (in any language, but preferably implementable in R)?

Here is a quick reproducible example of what we've done, including a simple benchmark showing timings:

od_id_order_base <- function(x, id1 = names(x)[1], id2 = names(x)[2]) {
  data.frame(
    stringsAsFactors = FALSE,
    stplanr.id1 = x[[id1]],
    stplanr.id1 = x[[id2]],
    stplanr.key = paste(
      pmin(x[[id1]], x[[id2]]),
      pmax(x[[id1]], x[[id2]])
    )
  )
}

od_id_order_rfast <- function(x, id1 = names(x)[1], id2 = names(x)[2]) {
  data.frame(
    stringsAsFactors = FALSE,
    stplanr.id1 = x[[id1]],
    stplanr.id1 = x[[id2]],
    stplanr.key = paste(
      Rfast::colPmin(as.numeric(as.factor(x[[id1]])), as.numeric(as.factor(x[[id1]]))),
      pmax(x[[id1]], x[[id2]])
    )
  )
}

od_id_order_dplyr <- function(x, id1 = names(x)[1], id2 = names(x)[2]) {
  dplyr::transmute_(x,
    stplanr.id1 = as.name(id1),
    stplanr.id2 = as.name(id2),
    stplanr.key = ~paste(pmin(stplanr.id1, stplanr.id2), pmax(stplanr.id1, stplanr.id2))
  )
}

szudzik_pairing <- function(val1, val2, ordermatters = FALSE) {
  if(length(val1) != length(val2)){
    stop("val1 and val2 are not of equal length")
  }

  if(class(val1) == "factor"){
    val1 <- as.character(val1)
  }
  if(class(val2) == "factor"){
    val2 <- as.character(val2)
  }
  lvls <- unique(c(val1, val2))
  val1 <- as.integer(factor(val1, levels = lvls))
  val2 <- as.integer(factor(val2, levels = lvls))
  if(ordermatters){
    ismax <- val1 > val2
    stplanr.key <- (ismax * 1) * (val1^2 + val1 + val2) + ((!ismax) * 1) * (val2^2 + val1)
  }else{
    a <- ifelse(val1 > val2, val2, val1)
    b <- ifelse(val1 > val2, val1, val2)
    stplanr.key <- b^2 + a
  }
  return(stplanr.key)
}


n = 1000
ids <- as.character(runif(n, 1e4, 1e7 - 1))

x <- data.frame(id1 = rep(ids, times = n),
                id2 = rep(ids, each = n),
                val = 1,
                stringsAsFactors = FALSE)



head(od_id_order_base(x))
#>        stplanr.id1    stplanr.id1.1                       stplanr.key
#> 1 8515501.50763425 8515501.50763425 8515501.50763425 8515501.50763425
#> 2 2454738.52108038 8515501.50763425 2454738.52108038 8515501.50763425
#> 3  223811.25236322 8515501.50763425  223811.25236322 8515501.50763425
#> 4 4882305.41496906 8515501.50763425 4882305.41496906 8515501.50763425
#> 5  4663684.5752892 8515501.50763425  4663684.5752892 8515501.50763425
#> 6 725621.968830239 8515501.50763425 725621.968830239 8515501.50763425
head(od_id_order_rfast(x))
#>        stplanr.id1    stplanr.id1.1          stplanr.key
#> 1 8515501.50763425 8515501.50763425 830 8515501.50763425
#> 2 2454738.52108038 8515501.50763425 163 8515501.50763425
#> 3  223811.25236322 8515501.50763425 135 8515501.50763425
#> 4 4882305.41496906 8515501.50763425 435 8515501.50763425
#> 5  4663684.5752892 8515501.50763425 408 8515501.50763425
#> 6 725621.968830239 8515501.50763425 689 8515501.50763425
head(od_id_order_dplyr(x))
#> Warning: transmute_() is deprecated. 
#> Please use transmute() instead
#> 
#> The 'programming' vignette or the tidyeval book can help you
#> to program with transmute() : https://tidyeval.tidyverse.org
#> This warning is displayed once per session.
#>        stplanr.id1      stplanr.id2                       stplanr.key
#> 1 8515501.50763425 8515501.50763425 8515501.50763425 8515501.50763425
#> 2 2454738.52108038 8515501.50763425 2454738.52108038 8515501.50763425
#> 3  223811.25236322 8515501.50763425  223811.25236322 8515501.50763425
#> 4 4882305.41496906 8515501.50763425 4882305.41496906 8515501.50763425
#> 5  4663684.5752892 8515501.50763425  4663684.5752892 8515501.50763425
#> 6 725621.968830239 8515501.50763425 725621.968830239 8515501.50763425
head(szudzik_pairing(x$id1, x$id2))
#> [1]  2  5 10 17 26 37


system.time(od_id_order_base(x))
#>    user  system elapsed 
#>   0.467   0.000   0.467
system.time(od_id_order_rfast(x))
#>    user  system elapsed 
#>   1.063   0.001   1.064
system.time(od_id_order_dplyr(x))
#>    user  system elapsed 
#>   0.493   0.000   0.493
system.time(szudzik_pairing(x$id1, x$id2))
#>    user  system elapsed 
#>   0.100   0.000   0.101

Created on 2019-07-27 by the reprex package (v0.3.0)

devtools::session_info()
#> ─ Session info ──────────────────────────────────────────────────────────
#>  setting  value                       
#>  version  R version 3.6.0 (2019-04-26)
#>  os       Debian GNU/Linux 9 (stretch)
#>  system   x86_64, linux-gnu           
#>  ui       X11                         
#>  language (EN)                        
#>  collate  en_US.UTF-8                 
#>  ctype    en_US.UTF-8                 
#>  tz       Etc/UTC                     
#>  date     2019-07-27                  
#> 
#> ─ Packages ──────────────────────────────────────────────────────────────
#>  package      * version date       lib source        
#>  assertthat     0.2.1   2019-03-21 [1] CRAN (R 3.6.0)
#>  backports      1.1.4   2019-04-10 [1] CRAN (R 3.6.0)
#>  callr          3.3.0   2019-07-04 [1] CRAN (R 3.6.0)
#>  cli            1.1.0   2019-03-19 [1] CRAN (R 3.6.0)
#>  crayon         1.3.4   2017-09-16 [1] CRAN (R 3.6.0)
#>  desc           1.2.0   2018-05-01 [1] CRAN (R 3.6.0)
#>  devtools       2.1.0   2019-07-06 [1] CRAN (R 3.6.0)
#>  digest         0.6.20  2019-07-04 [1] CRAN (R 3.6.0)
#>  dplyr          0.8.3   2019-07-04 [1] CRAN (R 3.6.0)
#>  evaluate       0.14    2019-05-28 [1] CRAN (R 3.6.0)
#>  fs             1.3.1   2019-05-06 [1] CRAN (R 3.6.0)
#>  glue           1.3.1   2019-03-12 [1] CRAN (R 3.6.0)
#>  highr          0.8     2019-03-20 [1] CRAN (R 3.6.0)
#>  htmltools      0.3.6   2017-04-28 [1] CRAN (R 3.6.0)
#>  knitr          1.23    2019-05-18 [1] CRAN (R 3.6.0)
#>  magrittr       1.5     2014-11-22 [1] CRAN (R 3.6.0)
#>  memoise        1.1.0   2017-04-21 [1] CRAN (R 3.6.0)
#>  pillar         1.4.2   2019-06-29 [1] CRAN (R 3.6.0)
#>  pkgbuild       1.0.3   2019-03-20 [1] CRAN (R 3.6.0)
#>  pkgconfig      2.0.2   2018-08-16 [1] CRAN (R 3.6.0)
#>  pkgload        1.0.2   2018-10-29 [1] CRAN (R 3.6.0)
#>  prettyunits    1.0.2   2015-07-13 [1] CRAN (R 3.6.0)
#>  processx       3.4.0   2019-07-03 [1] CRAN (R 3.6.0)
#>  ps             1.3.0   2018-12-21 [1] CRAN (R 3.6.0)
#>  purrr          0.3.2   2019-03-15 [1] CRAN (R 3.6.0)
#>  R6             2.4.0   2019-02-14 [1] CRAN (R 3.6.0)
#>  Rcpp           1.0.1   2019-03-17 [1] CRAN (R 3.6.0)
#>  RcppZiggurat   0.1.5   2018-06-10 [1] CRAN (R 3.6.0)
#>  remotes        2.1.0   2019-06-24 [1] CRAN (R 3.6.0)
#>  Rfast          1.9.4   2019-05-25 [1] CRAN (R 3.6.0)
#>  rlang          0.4.0   2019-06-25 [1] CRAN (R 3.6.0)
#>  rmarkdown      1.13    2019-05-22 [1] CRAN (R 3.6.0)
#>  rprojroot      1.3-2   2018-01-03 [1] CRAN (R 3.6.0)
#>  sessioninfo    1.1.1   2018-11-05 [1] CRAN (R 3.6.0)
#>  stringi        1.4.3   2019-03-12 [1] CRAN (R 3.6.0)
#>  stringr        1.4.0   2019-02-10 [1] CRAN (R 3.6.0)
#>  testthat       2.1.1   2019-04-23 [1] CRAN (R 3.6.0)
#>  tibble         2.1.3   2019-06-06 [1] CRAN (R 3.6.0)
#>  tidyselect     0.2.5   2018-10-11 [1] CRAN (R 3.6.0)
#>  usethis        1.5.1   2019-07-04 [1] CRAN (R 3.6.0)
#>  withr          2.1.2   2018-03-15 [1] CRAN (R 3.6.0)
#>  xfun           0.8     2019-06-25 [1] CRAN (R 3.6.0)
#>  yaml           2.2.0   2018-07-25 [1] CRAN (R 3.6.0)
#> 
#> [1] /usr/local/lib/R/site-library
#> [2] /usr/local/lib/R/library
RobinLovelace
  • 4,799
  • 6
  • 29
  • 40
  • I saw only your first option `duplicated(stplanr::od_id_order(x)[[3]])` – akrun Jul 27 '19 at 20:07
  • 1
    https://stackoverflow.com/questions/9028369/removing-duplicate-combinations-irrespective-of-order is about data that has 3 or more columns, this has 2 and is about efficiency. – RobinLovelace Jul 27 '19 at 20:08
  • 1
    You could change the `szudzik_pairing` and implement in `C++` with Rcpp – akrun Jul 27 '19 at 20:10
  • Not sure if I understand correctly, but what about `with(df, paste0(pmax(orign, dest), pmin(orign, dest)))` to create the pairs irrespective of order and then keep the non-duplicated values? – tmfmnk Jul 27 '19 at 20:30
  • Seems fine, and basically what od_id_order does, but is surprisingly slow on large datasets. – RobinLovelace Jul 27 '19 at 20:32
  • Is the reproducible example realistic in that every entry is a duplicate? Other algorithms may be useful if there are a handful of duplicates. – Cole Jul 28 '19 at 02:10

1 Answers1

4

Here are four possible approaches to generate unique keys from tuples (x, y) where order doesn't matter: od_id_order_base and szudzik_pairing, per OP's question; a modified Szudzik method szudzik_pairing_alt; and a method max_min that uses the formula shown in this answer.

convert_to_numeric <- function(x, y) {
  if (length(x) != length(y)) stop("x and y are not of equal length")
  if (class(x) == "factor") x <- as.character(x)
  if (class(y) == "factor") y <- as.character(y)
  lvls <- unique(c(x, y))
  x <- as.integer(factor(x, levels = lvls))
  y <- as.integer(factor(y, levels = lvls))
  list(x = x, y = y)  
}

od_id_order_base <- function(x, y) {
  d <- convert_to_numeric(x, y)
  x <- d$x
  y <- d$y
  paste(pmin(x, y), pmax(x, y))
}

szudzik_pairing <- function(x, y) {
  d <- convert_to_numeric(x, y)
  x <- d$x
  y <- d$y
  a <- ifelse(x > y, y, x)
  b <- ifelse(x > y, x, y)
  b^2 + a
}

szudzik_pairing_alt <- function(x, y) {
  d <- convert_to_numeric(x, y)
  x <- d$x
  y <- d$y
  z <- y^2 + x
  ifelse(y < x, x^2 + y, z)
}

max_min <- function(x, y) {
  d <- convert_to_numeric(x, y)
  x <- d$x
  y <- d$y
  a <- pmax(x, y)
  b <- pmin(x, y)
  a * (a + 1) / 2 + b
}

Generate the keys from some sample data and verify that we get the same results:

check_dupe <- function(f, x, y) duplicated(f(x, y))

set.seed(123)
n <- 1000^2
x <- ceiling(runif(n) * 1000)
y <- ceiling(runif(n) * 1000)

p <- lapply(list(od_id_order_base, szudzik_pairing, szudzik_pairing_alt, 
  max_min), check_dupe, x, y)
all(sapply(p[-1], function(x) identical(p[[1]], as.vector(x))))
# [1] TRUE

Benchmarking:

bmk <- microbenchmark::microbenchmark(
  p1 = check_dupe(od_id_order_base, x, y),
  p2 = check_dupe(szudzik_pairing, x, y),
  p3 = check_dupe(szudzik_pairing_alt, x, y),
  p4 = check_dupe(max_min, x, y),
  times = 100
)
# Unit: seconds
#  expr      min       lq     mean   median       uq      max neval cld
#    p1 2.958512 3.089615 3.336621 3.336915 3.474973 4.721378   100   b
#    p2 1.934742 2.058185 2.191331 2.190588 2.306203 2.983729   100  a 
#    p3 1.889201 1.990306 2.173845 2.138995 2.259218 5.186751   100  a 
#    p4 1.870261 1.980756 2.143026 2.145458 2.234580 3.111324   100  a 
Weihuang Wong
  • 12,868
  • 2
  • 27
  • 48
  • I think you can further improve the speed if you use `x <- match(x, lvls)` and `y <- match(y, lvls)` instead of `x <- as.integer(factor(x, levels = lvls))` inside `convert_to_numeric`. Not tested though. – markus Jul 28 '19 at 08:04
  • An also cheating a bit: `max_min2 <- function(x, y) { a <- .Internal(pmax(x,y)); b <- .Internal(pmin(x,y)); a * (a + 1) / 2 + b } ` – chinsoon12 Jul 29 '19 at 01:59