Below are computational times for the "IntervalSurgeon", "intervals" and "data.table" approaches. The times are for the dataset containing one million IDs, with 10 exposures per ID, i.e. 10M rows. I've done a single run due to the "intervals" approach taking too long.
The machine was a MacBook Pro (15-inch, 2018), with a 2.9 GHz Intel Core i9 processor, 32 GB 2400 MHz DDR4 memory, running macOS Mojave v. 10.14.6, max 12 threads.
The "data.table" approach was a clear winner, just as the "intervals" approach was a clear loser:
COMPUTATIONAL TIMES
"IntervalSurgeon way:"
user system elapsed
469.296 6.528 473.200
"intervals way:"
user system elapsed
2463.840 8.137 2476.685
"data.table way:"
user system elapsed
22.125 0.133 21.249
Interestingly, none of the approaches benefited from setting a larger number of data.table threads through setDTthreads(). Splitting the dataset into 100 equal parts and passing them to the "data.table" approach through parallel::mclapply (mc.cores = 10) brings the computational time below 5 seconds (the code is also provided below).
The "data.table + parallel::mclapply" combination approach also worked with a much larger dataset of 50M IDs (generated the same way as the 1M IDs dataset but with MM = 50). This dataset has 500M rows, split into 1000 subsets for mclapply. I set mc.cores = 4 so as not to exhaust RAM. The single-run computational time was 451.485 sec. — not a bad result for a laptop! It would be great if this kind of analysis could one day be incorporated into the data.table package, together with other similar types of interval analyses as in the IntervalSurgeon package.
GENERATING THE DATASET
library(data.table)
library(fst)
rm(list = ls())
gc()
set.seed(2019)
# how many millions of IDs required?
MM <- 1
N <- 1000000 * MM
# unique IDs
IDs <- paste0("ID", 1:N)
# number of exposures per ID
K <- 10
# random start/end days within a year; pmin/pmax swap so starts <= ends
ss <- sample(1:365, N * K, replace = TRUE)
ff <- sample(1:365, N * K, replace = TRUE)
ss_s <- pmin(ss, ff)
ff_s <- pmax(ss, ff)
# ends = last exposed day + 1, so intervals are half-open style
DT <- data.table(IDs = rep(IDs, each = K),
                 starts = as.integer(ss_s),
                 ends = as.integer(ff_s + 1))
# BUG FIX: the original `DT[order(IDs, starts, ends),]` discarded its
# result, so DT was written out UNSORTED. The cummax/shift merging
# approach used later requires rows sorted by (IDs, starts).
# setorder() sorts the data.table in place, by reference.
setorder(DT, IDs, starts, ends)
write_fst(DT, path = paste0("fst_DT_", MM, "Mx3.fst"))
DT
IDs starts ends
1: ID1 4 22
2: ID1 16 233
3: ID1 19 224
4: ID1 31 227
5: ID1 38 147
---
9999996: ID999999 152 310
9999997: ID999999 160 218
9999998: ID999999 180 272
9999999: ID999999 215 332
10000000: ID999999 260 265
THE CODE FOR THREE APPROACHES
library(data.table)
library(IntervalSurgeon)
library(intervals)
library(fst)
rm(list = ls())
gc()
threads_no <- 2L
setDTthreads(threads_no)
print(getDTthreads())
# read the dataset generated above
#####################
DT <- read_fst("fst_DT_1Mx3.fst", as.data.table = TRUE)
# The cummax/shift approach (3) below requires rows sorted by
# (IDs, starts); sort in place to be safe — a cheap no-op if the
# file is already sorted.
setorder(DT, IDs, starts, ends)
print("dataset dimensions:")
print(dim(DT))
# "intervals" function
#####################
myfun <- function( y ) {
data.table::as.data.table(
intervals::interval_union(
intervals::Intervals( as.matrix( y ) ), check_valid = TRUE )
)
}
#####################
# 1) "IntervalSurgeon" way
# IntervalSurgeon::flatten() merges each ID's (starts, ends) matrix into
# disjoint intervals; the matrix result is rewrapped as list columns per
# ID group.
#####################
ptm1 <- proc.time()
tab_out1 <- DT[, as.list(data.table(
flatten(as.matrix(cbind(starts, ends))))),
by = IDs]
exec_time1 <- proc.time() - ptm1
print("IntervalSurgeon way:")
print(exec_time1)
#####################
# 2) "intervals" way
# myfun() returns numeric V1/V2; the second [] converts them back to
# integer by reference so the result is comparable with the others.
#####################
ptm2 <- proc.time()
tab_out2 <- DT[, myfun( .SD ), by = .(IDs)][,
c("V1", "V2") := lapply(.SD, as.integer), .SDcols = c("V1", "V2")]
exec_time2 <- proc.time() - ptm2
print("intervals way:")
print(exec_time2)
#####################
# 3) "data.table" way
# Within each ID (rows assumed sorted by starts — confirm the input
# file is sorted): a new merged group begins whenever the NEXT row's
# start exceeds the running maximum of ends seen so far.
#   shift(starts, -1L) = next row's start; cummax(ends) = running max end;
#   cumsum of that comparison, prefixed with 0L and with the last element
#   dropped ([-.N]), gives a per-row group id g.
# NOTE: `:=` adds column g to DT in place (side effect on DT).
# Each (g, IDs) group is then collapsed to its min start / max end.
#####################
ptm3 <- proc.time()
tab_out3 <- DT[, g := c(0L, cumsum(shift(starts, -1L) > cummax(ends))[-.N]), IDs][,
.(min(starts), max(ends)), .(g, IDs)][,g := NULL]
exec_time3 <- proc.time() - ptm3
print("data.table way:")
print(exec_time3)
#####################
# Cross-check that all three approaches agree; identical() also requires
# matching column names/types and row order, not just the same intervals.
print(identical(tab_out1, tab_out2))
print(identical(tab_out2, tab_out3))
"data.table + parallel::mclapply" COMBINATION APPROACH
library(data.table)
library(IntervalSurgeon)
library(intervals)
library(fst)
library(parallel)
rm(list = ls())
gc()
mc_cores <- 10
threads_no <- 2L
setDTthreads(threads_no)
print(getDTthreads())
DT <- read_fst("fst_DT_1Mx3.fst", as.data.table = TRUE)
# split the dataset into G roughly equal, contiguous parts
#####################
G <- 100
nn <- nrow(DT)
# BUG FIX: the original built chunks as ii_low:ii_up with
# step_c <- nn/G, which produces fractional bounds (rows duplicated or
# skipped) whenever nn is not divisible by G, and could also cut one
# ID's rows across two chunks, so that ID's intervals were merged
# separately, yielding incorrect output. Instead, map each row to its
# ID's run index (rows of an ID are contiguous in the file) and split
# whole ID runs evenly across the G chunks.
id_run <- rleid(DT$IDs)
list_inx <- split(seq_len(nn), ceiling(id_run * G / id_run[nn]))
#####################
ptm3 <- proc.time()
# approach implementation: run the "data.table" merge (approach 3) on
# each chunk in parallel; combine afterwards with rbindlist(list_out)
# if a single table is required
#####################
list_out <- mclapply(seq_along(list_inx), function(i) {
  DT_tmp <- DT[list_inx[[i]], ]
  DT_tmp[, g := c(0L, cumsum(shift(starts, -1L) > cummax(ends))[-.N]), IDs][,
    .(min(starts), max(ends)), .(g, IDs)][, g := NULL]
},
mc.cores = mc_cores
)
#####################
exec_time3 <- proc.time() - ptm3
print("data.table + parallel::mclapply:")
print(exec_time3)
SESSION INFO
sessionInfo()
R version 3.5.2 (2018-12-20)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Mojave 10.14.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
locale:
[1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base
other attached packages:
[1] intervals_0.15.1 IntervalSurgeon_1.0 fst_0.9.0 data.table_1.12.2
loaded via a namespace (and not attached):
[1] compiler_3.5.2 Rcpp_1.0.1