2

I've got a data.table that has UniqueID's for each row, an age column, and 2 columns with a confidence interval for the age. What I want to do is to merge rows that have an overlapping CI so the CI returned is the min/max of the overlap.

   ID   Age AgeMax AgeMin
1:  2 48073  49213  46933
2:  3 49002  49638  48366
3:  7 44297  44706  43888

The returned result for this example would then be:

ID  Age    AgeMax   AgeMin
2   48409   49638   46933
7   44297   44706   43888

As ID 2 and 3 have overlapping values in AgeMax and AgeMin. The AgeMax for ID == 2, is less than the AgeMax for ID == 3, but greater than the AgeMin for ID == 3 and therefore they overlap. ID 7 doesn't overlap with the other rows so it's returned as it was.

I actually don't mind which ID and Age are returned, as long as they come from one of the overlapping IDs.

My attempt is below, but I'm not getting the right results

library(data.table)
# Year grid that the CIs get snapped to: one value every `step` years,
# inset by half a window from both ends of the 60,000-year span
step <- 10
window <- 30
startYear <- 0.5 * window - 60000
endYear <- -0.5 * window
yrSeq <- abs(seq(from = startYear, to = endYear, by = step))

# Example DT: three rows, of which IDs 2 and 3 have overlapping intervals
DT <- structure(
  list(
    ID     = c(2L, 3L, 7L),
    Age    = c(48073L, 49002L, 44297L),
    AgeMax = c(49213L, 49638L, 44706L),
    AgeMin = c(46933L, 48366L, 43888L)
  ),
  row.names = c(NA, -3L),
  class = c("data.table", "data.frame")
)

# split into a list to expand the CI's
# (split() on DT$ID gives one list element per ID)
s <- split(DT, DT$ID)

# Expand the CI's, to the nearest year in the seq
# merge back into a DT
# (long format: one row per ID for every yrSeq value inside [AgeMin, AgeMax])
d_seq <- rbindlist(lapply(s, function(x) {
      data.table(ID = x$ID, Yr = yrSeq[between(yrSeq, x$AgeMin, x$AgeMax)])}))

# remove duplicated years and return min and max years for each ID
# NOTE(review): duplicated() keeps only the first occurrence of each year, so a
# year shared by two overlapping IDs stays with the earlier ID and is removed
# from the later one. Overlapping IDs are therefore trimmed apart rather than
# merged, and the by = ID summary below still returns one row per ID.
d_seq <- d_seq[!duplicated(d_seq$Yr),]
d_seq <- d_seq[, .(AgeMin = min(Yr), AgeMax = max(Yr)), by = ID]

# merge with the original DT and select columns
# (both tables carry AgeMin/AgeMax, so merge() suffixes them with .x/.y;
# positions 5 and 6 are the recomputed bounds coming from d_seq)
DT <- merge(DT, d_seq, by = "ID")
DT <- DT[, c(1,2,5,6)]

Unfortunately, this isn't working: ID == 3 is still returned even though it overlaps with ID == 2 (as shown above), and now the AgeMin and AgeMax for ID == 2 don't cover the Age for that ID!

   ID   Age AgeMin AgeMax
1:  2 48073    46935    49205
2:  3 49002    49215    49635
3:  7 44297    43895    44705

I'm sure I'm overthinking this and that there must be a simple way that returns what I need, unfortunately I haven't been able to find any solutions.

I've already tried to modify the examples here and here.

Here is an extra example data.table to test on.

# Extra sample data: eight rows that collapse into three groups of
# overlapping [AgeMin, AgeMax] intervals
testDT <- structure(
  list(
    ID     = c(54L, 57L, 58L, 60L, 61L, 62L, 64L, 180L),
    Age    = c(14219L, 13989L, 13883L, 13482L, 13403L, 13383L, 13340L, 13994L),
    AgeMax = c(14343L, 14087L, 13972L, 13540L, 13465L, 13442L, 13407L, 14083L),
    AgeMin = c(14095L, 13891L, 13794L, 13424L, 13341L, 13324L, 13273L, 13905L)
  ),
  row.names = c(NA, -8L),
  class = c("data.table", "data.frame")
)
KaanKaant
  • 434
  • 3
  • 16

3 Answers3

1

I think you need a combination of data.table and igraph as the overlaps might be chaining endlessly even though the last ID might have no overlap with the first ID in the chain.

Here is an option:

# Find every pair of overlapping intervals with data.table::foverlaps.
# foverlaps() works off the key, so the interval columns are set as the key
# first (note that setkey() also re-sorts testDT by AgeMin, AgeMax).
setkey(setDT(testDT), AgeMin, AgeMax)
# Self-join: pmin/pmax put each pair in a canonical order so unique() can drop
# the mirrored (a, b)/(b, a) duplicates. Each row also overlaps itself, which
# keeps IDs without any partner in the edge list.
d <- unique(foverlaps(testDT, testDT)[, .(x=pmin(ID, i.ID), y=pmax(ID, i.ID))])

# Find clusters of IDs with overlapping intervals: IDs are vertices, overlaps
# are edges, and every connected component is one chain of overlapping rows.
library(igraph)
g <- graph_from_data_frame(d, directed=FALSE)
# components() is the current name for the deprecated clusters(); membership is
# a vector named by vertex (the ID as a string), so stack() yields
# values = component number and ind = ID, which is converted back to integer.
m <- setDT(stack(components(g)$membership))[, ind := as.integer(as.character(ind))]

# Look up the grouping using an update join
testDT[m, on=.(ID=ind), cls := values]

# Output: one row per cluster, keeping the first ID/Age (in ID order) and the
# outer bounds of the merged interval
testDT[order(ID), .(ID=ID[1L], Age=Age[1L], AgeMin=min(AgeMin), AgeMax=max(AgeMax)),
    cls]

output:

   cls ID   Age AgeMin AgeMax
1:   3 54 14219  14095  14343
2:   2 57 13989  13794  14087
3:   1 60 13482  13273  13540

edit: Frank Zhang's approach reminded me of David Arenburg's approach in How to flatten / merge overlapping time periods. Hence something like:

# Walk the intervals in start order and bump the group counter every time an
# interval begins after the largest AgeMax seen so far (i.e. there is a gap).
setDT(testDT)
testDT[
  order(AgeMin, AgeMax),
  g := cumsum(c(0L, AgeMin[-1L] > cummax(AgeMax)[-.N]))
]

# One row per group: first ID/Age (in ID order) plus the merged bounds.
testDT[
  order(ID),
  .(ID = ID[1L], Age = Age[1L], AgeMin = min(AgeMin), AgeMax = max(AgeMax)),
  by = g
]

This should be faster.

chinsoon12
  • 25,005
  • 4
  • 25
  • 35
1

Here is a data.table solution

library(data.table)
setDT(testDT)

# Sort by interval start, then open a new group whenever the next interval
# starts after the furthest end seen so far. cummax() is needed here: comparing
# only with the previous row's AgeMax would wrongly split the group when a
# short interval is nested inside an earlier, longer one
# (e.g. [1, 10], [2, 3], [5, 6] must stay a single group).
testDT[order(AgeMin)
      ][, .(AgeMin=min(AgeMin), AgeMax=max(AgeMax)),
       by=.(group=cumsum(c(1, tail(AgeMin, -1) > cummax(head(AgeMax, -1)))))]
#>    group AgeMin AgeMax
#> 1:     1  13273  13540
#> 2:     2  13794  14087
#> 3:     3  14095  14343

The key of this solution is getting the group of overlapping periods.

Let's say we have two ranges, p1 and p2, with start and end points named start1, end1, start2 and end2.

There are only two conditions under which p1 and p2 do not overlap.

  1. start1 > end2
    OR
  2. end1 < start2

Since we have already sorted by AgeMin in ascending order, we only need to consider condition 1. Then we can use cumsum to get the group identifier.

Frank Zhang
  • 1,670
  • 7
  • 14
0

This solution is pretty readable and flexible, so adjust it to your needs.

sample data

# Sample data: eight rows with an age and its [AgeMin, AgeMax] interval
testDT <- structure(
  list(
    ID     = c(54L, 57L, 58L, 60L, 61L, 62L, 64L, 180L),
    Age    = c(14219L, 13989L, 13883L, 13482L, 13403L, 13383L, 13340L, 13994L),
    AgeMax = c(14343L, 14087L, 13972L, 13540L, 13465L, 13442L, 13407L, 14083L),
    AgeMin = c(14095L, 13891L, 13794L, 13424L, 13341L, 13324L, 13273L, 13905L)
  ),
  row.names = c(NA, -8L),
  class = c("data.table", "data.frame")
)

code

library( data.table )
library( intervals )
#set testDT as data.table
setDT(testDT)
#assuming you want to merge all overlapping intervals to one long interval...
# create a table with joined intervals
# Intervals() needs the min-col before the max-col, so select the two bounds
# by name in that order instead of relying on their column positions
DT.int <- as.data.table(
  intervals::interval_union( 
    intervals::Intervals( as.matrix( testDT[, .( AgeMin, AgeMax )] ) ) , 
    check_valid = TRUE ) )
#set colnames
setnames( DT.int, c("AgeMin", "AgeMax" ) )
#add interval id's (trailing [] prints the table after the by-reference update)
DT.int[, interval_id := .I ][]
#    AgeMin AgeMax interval_id
# 1:  13273  13540           1
# 2:  13794  14087           2
# 3:  14095  14343           3

#now you can join the intervals back to DT, whatever/however you like
# non-equi update join: a row of testDT matches a merged interval when
# testDT's AgeMin <= the interval's AgeMax and testDT's AgeMax >= the
# interval's AgeMin; the i. prefix refers to DT.int's columns
testDT[ DT.int, 
        `:=`( AgeMin.interval = i.AgeMin, AgeMax.interval = i.AgeMax, 
              interval.id = i.interval_id ),
        on = .( AgeMin <= AgeMax, AgeMax >= AgeMin ) ]

output

testDT

#     ID   Age AgeMax AgeMin AgeMin.interval AgeMax.interval interval.id
# 1:  54 14219  14343  14095           14095           14343           3
# 2:  57 13989  14087  13891           13794           14087           2
# 3:  58 13883  13972  13794           13794           14087           2
# 4:  60 13482  13540  13424           13273           13540           1
# 5:  61 13403  13465  13341           13273           13540           1
# 6:  62 13383  13442  13324           13273           13540           1
# 7:  64 13340  13407  13273           13273           13540           1
# 8: 180 13994  14083  13905           13794           14087           2

now process/summarise further as you like, for example: to get non-duplicated rows for interval.id...

# keep the first row of every merged interval, but report the bounds of the
# merged interval (AgeMax.interval / AgeMin.interval) rather than that row's
# own, narrower AgeMax / AgeMin
testDT[ !duplicated( interval.id ),
        .(ID, Age, AgeMax = AgeMax.interval, AgeMin = AgeMin.interval) ]
#    ID   Age AgeMax AgeMin
# 1: 54 14219  14343  14095
# 2: 57 13989  14087  13794
# 3: 60 13482  13540  13273
Wimpel
  • 26,031
  • 1
  • 20
  • 37