In R
, I have a set of time intervals, some of which overlap each other, and those overlaps can form chains of overlaps (interval A overlaps B, B overlaps C, but A does not overlap C). I want to find the minimum set of intervals that covers this set of intervals.
I have a solution using lubridate
intervals, but it uses older paradigms, such as pushing and popping a stack of intervals. That solution is below. I am wondering if I am missing a simpler functional solution or package that should be doing this for me (I am worried that my code is fragile, and would rather use a tried and tested solution).
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(lubridate))
# In this table of intervals, rows 3,4, and 5 form a chain of intervals. They should be rolled into 1 interval.
# And note rows 3 and 5 do not themselves overlap, but they are chained together by having overlap with row 4.
dat <- read.csv(text="
start,end
2017-09-01 00:00,2017-09-01 00:01
2017-09-01 00:02,2017-09-01 00:03
2017-09-01 00:04,2017-09-01 00:08
2017-09-01 00:07,2017-09-01 00:15
2017-09-01 00:09,2017-09-01 00:16
2017-09-01 00:20,2017-09-01 00:22") %>%
transmute(
gtStart = ymd_hm(start)
, gtEnd = ymd_hm(end))
iv_clean <- list()
iv_process <- interval(dat$gtStart, dat$gtEnd)
while(length(iv_process) > 0) {
e <- iv_process[1]
iv_process <- iv_process[-1]
## If e is last item in iv_process, add it to iv_clean and stop processing
if (!length(iv_process)) {
if (!length(iv_clean)) {
iv_clean <- e
} else {
iv_clean <- c(e, iv_clean)
}
break
}
## For every remaining interval that overlaps e, union it with e
## And trip a flag that says that we found an overlapping interval
e_nonoverlapping <- TRUE
for (i in 1:length(iv_process)) {
if (int_overlaps(e, iv_process[i])) {
e_nonoverlapping <- FALSE
iv_process[i] <- union(e, iv_process[i])
}
}
## If e did not overlap with any interval, then add it to iv_clean
## Otherwise, don't, and continue processing iv_process
if (e_nonoverlapping) {
if (!length(iv_clean)) {
iv_clean <- e
} else {
iv_clean <- c(e, iv_clean)
}
}
}
## Print result
print(iv_clean)
#> [1] 2017-09-01 00:20:00 UTC--2017-09-01 00:22:00 UTC
#> [2] 2017-09-01 00:04:00 UTC--2017-09-01 00:16:00 UTC
#> [3] 2017-09-01 00:02:00 UTC--2017-09-01 00:03:00 UTC
#> [4] 2017-09-01 00:00:00 UTC--2017-09-01 00:01:00 UTC