1

My objective was to define unique, non-overlapping ranges (or time intervals) per ID, when each ID has multiple, potentially overlapping ranges of exposure. I've found that the "flatten" function from the R "IntervalSurgeon" package can implement the task. My question is: how can I efficiently implement the same task and get the same "tab_out" output in a "data.table" way?

library(data.table)
library(IntervalSurgeon)

set.seed(2019)

N <- 3 # number of IDs

IDs <- paste0("ID", 1:N) # unique IDs

K <- 4 # number of exposures per ID

# Simulated exposures: K rows per ID, each with a random start in 1:20 and
# an end 1-5 units after the start. Use TRUE rather than the reassignable
# alias T.
DT <- data.table(IDs = rep(IDs, each = K), 
    starts = sample(1:20, N * K, replace = TRUE))[,
    ends := starts + sample(1:5, N * K, replace = TRUE)]

# Sort by reference; setorder() avoids the full copy that
# DT <- DT[order(...), ] makes.
setorder(DT, IDs, starts)

# Per ID, merge overlapping/adjacent intervals: flatten() takes a 2-column
# (start, end) matrix and returns the flattened, non-overlapping set.
tab_out <- DT[, as.list(data.table(
    flatten(as.matrix(cbind(starts, ends))))), 
    by = IDs]

DT
    IDs starts ends
 1: ID1      7   11
 2: ID1     13   17
 3: ID1     15   16
 4: ID1     16   18
 5: ID2      1    5
 6: ID2      1    4
 7: ID2      2    3
 8: ID2     17   19
 9: ID3      3    6
10: ID3     13   16
11: ID3     14   15
12: ID3     16   21

tab_out
   IDs V1 V2
1: ID1  7 11
2: ID1 13 18
3: ID2  1  5
4: ID2 17 19
5: ID3  3  6
6: ID3 13 21
s_baldur
  • 29,441
  • 4
  • 36
  • 69

3 Answers3

3

Borrowing the idea from David Arenburg's solution here

# Within each ID (rows assumed sorted by starts): flag a gap wherever the
# NEXT interval's start exceeds the running maximum end seen so far
# (cummax handles intervals nested inside earlier, longer ones). cumsum of
# those flags, dropped at the last row and prefixed with 0L, yields a group
# id g that increments at every gap. Then min(start)/max(end) per (g, IDs)
# gives the flattened, non-overlapping ranges.
DT[, g := c(0L, cumsum(shift(starts, -1L) > cummax(ends))[-.N]), IDs][,
    .(min(starts), max(ends)), .(g, IDs)]

output:

   g IDs V1 V2
1: 0 ID1  7 11
2: 1 ID1 13 18
3: 0 ID2  1  5
4: 1 ID2 17 19
5: 0 ID3  3  6
6: 1 ID3 13 21
chinsoon12
  • 25,005
  • 4
  • 25
  • 35
1

Below are computational times for the "IntervalSurgeon", "intervals" and "data.table" approaches. The times are for the dataset containing one million IDs, with 10 exposures per ID, i.e. 10M rows. I've done a single run due to the "intervals" approach taking too long.

The machine was a MacBook Pro (15-inch, 2018), with a 2.9 GHz Intel Core i9 processor and 32 GB 2400 MHz DDR4 memory, running macOS Mojave v. 10.14.6, max 12 threads.

The "data.table" approach was a clear winner, the same as "intervals" approach was a clear loser:

COMPUTATIONAL TIMES

"IntervalSurgeon way:"
   user  system elapsed 
469.296   6.528 473.200 

"intervals way:"
    user   system  elapsed 
2463.840    8.137 2476.685 

"data.table way:"
   user  system elapsed 
 22.125   0.133  21.249 

Interestingly, none of the approaches benefited from setting a larger number of data.table threads through setDTthreads(). Splitting the dataset into 100 equal parts and passing them to the "data.table" approach through parallel::mclapply (mc.cores = 10) brings the computational time below 5 seconds (the code is also provided below).

The "data.table + parallel::mclapply" combination approach also worked with a much larger dataset of 50M IDs (generated the same way as the 1M IDs dataset but with MM = 50). This dataset has 500M rows, split into 1000 subsets for mclapply. I set mc.cores = 4 so as not to exhaust RAM. The single-run computational time was 451.485 sec. — not a bad result for a laptop! It would be great if this kind of analysis were one day incorporated into the data.table package, together with other similar types of interval analyses as in the IntervalSurgeon package.

GENERATING THE DATASET

library(data.table)
library(fst)

# NOTE(review): rm(list = ls()) in a script wipes the caller's workspace;
# kept only to reproduce the original benchmark setup — avoid in general.
rm(list = ls())
gc()

set.seed(2019)

# how many millions of IDs required?
MM <- 1 
N <- 1000000 * MM

# unique IDs
IDs <- paste0("ID", 1:N)

# number of exposures per ID
K <- 10 

# Random day-of-year pairs; TRUE rather than the reassignable alias T.
ss <- sample(1:365, N * K, replace = TRUE)
ff <- sample(1:365, N * K, replace = TRUE)

# Orient each pair so that start <= end.
ss_s <- pmin(ss, ff)
ff_s <- pmax(ss, ff)

DT <- data.table(IDs = rep(IDs, each = K), 
     starts = as.integer(ss_s), 
        ends =  as.integer(ff_s + 1))

# Bug fix: the original `DT[order(IDs, starts, ends), ]` computed a sorted
# copy and discarded it, so the unsorted table was written to disk.
# setorder() sorts DT by reference so the file below is actually sorted.
setorder(DT, IDs, starts, ends)

write_fst(DT, path = paste0("fst_DT_", MM, "Mx3.fst"))

DT2
                IDs starts ends
       1:      ID1      4   22
       2:      ID1     16  233
       3:      ID1     19  224
       4:      ID1     31  227
       5:      ID1     38  147
      ---                     
 9999996: ID999999    152  310
 9999997: ID999999    160  218
 9999998: ID999999    180  272
 9999999: ID999999    215  332
10000000: ID999999    260  265

THE CODE FOR THREE APPROACHES

library(data.table)
library(IntervalSurgeon)
library(intervals)
library(fst)

# NOTE(review): rm(list = ls()) wipes the caller's workspace; kept only to
# reproduce the original benchmark environment.
rm(list = ls())
gc()

threads_no <- 2L
setDTthreads(threads_no)
print(getDTthreads())

# read the dataset generated above
#####################
DT <- read_fst("fst_DT_1Mx3.fst", as.data.table = TRUE)

# Typo fix in the printed message: "dimentions" -> "dimensions".
print("dataset dimensions:")
print(dim(DT))
#####################

# "intervals" helper: union of the (possibly overlapping) intervals of one
# group, returned as a data.table of flattened ranges.
#####################
myfun <- function( y ) {
  data.table::as.data.table( 
    intervals::interval_union(
      intervals::Intervals( as.matrix( y ) ), check_valid = TRUE ) 
    )
}
#####################

# 1) "IntervalSurgeon" way: flatten() merges overlapping (start, end) rows
#####################
ptm1 <- proc.time()

tab_out1 <- DT[, as.list(data.table(
    flatten(as.matrix(cbind(starts, ends))))), 
        by = IDs]

exec_time1 <- proc.time() - ptm1
print("IntervalSurgeon way:")
print(exec_time1)
#####################       

# 2) "intervals" way  
##################### 
ptm2 <- proc.time()

tab_out2 <- DT[, myfun( .SD ), by = .(IDs)][,
    c("V1", "V2") := lapply(.SD, as.integer), .SDcols = c("V1", "V2")]

exec_time2 <- proc.time() - ptm2
print("intervals way:")
print(exec_time2)
##################### 

# 3) "data.table" way: group id g increments whenever the next start
# exceeds the running maximum end within an ID
#####################
ptm3 <- proc.time()

tab_out3 <- DT[, g := c(0L, cumsum(shift(starts, -1L) > cummax(ends))[-.N]), IDs][,
    .(min(starts), max(ends)), .(g, IDs)][,g := NULL]

exec_time3 <- proc.time() - ptm3
print("data.table way:")
print(exec_time3)
#####################  

# NOTE(review): identical() also compares attributes such as keys and
# row order, so FALSE here need not mean the interval contents differ —
# TODO consider comparing sorted copies with all.equal() instead.
print(identical(tab_out1, tab_out2))
print(identical(tab_out2, tab_out3))

"data.table + parallel::mclapply" COMBINATION APPROACH

library(data.table)
library(IntervalSurgeon)
library(intervals)
library(fst)
library(parallel)

# NOTE(review): rm(list = ls()) wipes the caller's workspace; kept only to
# reproduce the original benchmark environment.
rm(list = ls())
gc()

mc_cores <- 10

threads_no <- 2L
setDTthreads(threads_no)
print(getDTthreads())

DT <- read_fst("fst_DT_1Mx3.fst", as.data.table = TRUE)

# split the dataset into G equal parts
##################### 
G <- 100
nn <- nrow(DT)

# Guard: the contiguous split below (and the original grow-by-loop it
# replaces) assumes nn divides evenly by G; a fractional chunk size would
# silently produce non-integer row indices.
stopifnot(nn %% G == 0)
step_c <- nn %/% G

# One contiguous block of row indices per part. For correctness each ID's
# rows must stay within one chunk, which holds when step_c is a multiple
# of the per-ID row count (K = 10 here).
list_inx <- split(seq_len(nn), rep(seq_len(G), each = step_c))
##################### 

ptm3 <- proc.time()

# approach implementation: run the "data.table" flattening on each chunk
# in a forked worker
#####################
list_out <- mclapply(seq_len(G), function(i) {
            DT_tmp <- DT[list_inx[[i]],]
            return(DT_tmp[, g := c(0L, cumsum(shift(starts, -1L) > cummax(ends))[-.N]), IDs][,
                .(min(starts), max(ends)), .(g, IDs)][,g := NULL])
                },
                mc.cores = mc_cores
                )
#####################

exec_time3 <- proc.time() - ptm3
print("data.table + parallel::mclapply:")
print(exec_time3)

SESSION INFO

sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib

locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] intervals_0.15.1    IntervalSurgeon_1.0 fst_0.9.0           data.table_1.12.2  

loaded via a namespace (and not attached):
[1] compiler_3.5.2 Rcpp_1.0.1 
0

Here is a solution using the intervals package.

sample data

library( data.table )
library( intervals )

# Hard-coded copy of the question's example data: 3 IDs with 4 exposure
# intervals each, some overlapping within an ID.
DT <- fread("
IDs starts ends
ID1      7   11
ID1     13   17
ID1     15   16
ID1     16   18
ID2      1    5
ID2      1    4
ID2      2    3
ID2     17   19
ID3      3    6
ID3     13   16
ID3     14   15
ID3     16   21")

code

# Collapse the (possibly overlapping) start/end rows of one group into
# their union, returned as a data.table of non-overlapping ranges.
myfun <- function( y ) {
  ivals <- intervals::Intervals( as.matrix( y ) )
  merged <- intervals::interval_union( ivals, check_valid = TRUE )
  data.table::as.data.table( merged )
}

DT[, myfun( .SD ), by = .(IDs)]

#    IDs V1 V2
# 1: ID1  7 11
# 2: ID1 13 18
# 3: ID2  1  5
# 4: ID2 17 19
# 5: ID3  3  6
# 6: ID3 13 21
Wimpel
  • 26,031
  • 1
  • 20
  • 37