1

I have a large data frame that looks like this.

I want to group_by seqnames and for each group, I want to check for overlapping ranges between the start and end. If there is any overlapping range, then it should stay the row with the highest score.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
df <- tibble(seqnames=rep(c("Chr1","Chr2"),each=3),
       start=c(100,200,300,100,200,300),
       end=c(150,400,500,120,220,320),
       score=c(1000,500,1000,1000,1000,1000))

df
#> # A tibble: 6 × 4
#>   seqnames start   end score
#>   <chr>    <dbl> <dbl> <dbl>
#> 1 Chr1       100   150  1000
#> 2 Chr1       200   400   500
#> 3 Chr1       300   500  1000
#> 4 Chr2       100   120  1000
#> 5 Chr2       200   220  1000
#> 6 Chr2       300   320  1000

Created on 2022-12-27 with reprex v2.0.2

the desired output is

  seqnames start   end score
  <chr>    <dbl> <dbl> <dbl>
 Chr1       100   150  1000
 Chr1       300   500  1000
 Chr2       100   120  1000
 Chr2       200   220  1000
 Chr2       300   320  1000
user438383
  • 5,716
  • 8
  • 28
  • 43
LDT
  • 2,856
  • 2
  • 15
  • 32
  • what would be the expected result if end of row 1 was 250 and score was 50, ie a chain of overlaps? – Waldi Dec 27 '22 at 18:01
  • yes if there is any indirect chain reaction it should be merged into the latter range. for example in the case you describe there should be only one line at the end `Chr1 300 500 1000` – LDT Dec 27 '22 at 18:06

2 Answers2

1

This works for your example. The last arrange call is only there to get the output in the same format as your desired output.

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

df <- tibble(seqnames=rep(c("Chr1","Chr2"),each=3),
             start=c(100,200,300,100,200,300),
             end=c(150,400,500,120,220,320),
             score=c(1000,500,1000,1000,1000,1000))


df |> 
  group_by(seqnames) |> 
  arrange(start) |> 
  mutate(
    remove = case_when(
      end > lead(start)  & score < lead(score) ~ TRUE,
      start < lag(end)  & score < lag(score) ~ TRUE,
      TRUE ~ FALSE)
    ) |>
  ungroup() |> 
    filter(!remove) |> 
  select(-remove) |>
  arrange(seqnames, start)
#> # A tibble: 5 × 4
#>   seqnames start   end score
#>   <chr>    <dbl> <dbl> <dbl>
#> 1 Chr1       100   150  1000
#> 2 Chr1       300   500  1000
#> 3 Chr2       100   120  1000
#> 4 Chr2       200   220  1000
#> 5 Chr2       300   320  1000

Created on 2022-12-27 with reprex v2.0.2

1

You could use ivs, see:

library(dplyr)
library(ivs)

df <- df %>% mutate(interval = iv(start, end))

df %>%
  group_by(seqnames) %>%
  mutate(interval_group = iv_identify_group(interval)) %>%
  group_by(seqnames,interval_group) %>%
  top_n(1,score) %>% 
  ungroup %>%
  select(seqnames, start,end,score)


# A tibble: 5 × 4
#  seqnames start   end score
#  <chr>    <dbl> <dbl> <dbl>
#1 Chr1       100   150  1000
#2 Chr1       300   500  1000
#3 Chr2       100   120  1000
#4 Chr2       200   220  1000
#5 Chr2       300   320  1000

or with data.table:

library(data.table)
library(ivs)

setDT(df)
df[,interval_group:=iv_identify_group(iv(start, end)),seqnames][
   ,.SD[score==max(score)],.(seqnames,interval_group)][
   ,.(seqnames,start,end,score)] 

#   seqnames start   end score
#     <char> <num> <num> <num>
#1:     Chr1   100   150  1000
#2:     Chr1   300   500  1000
#3:     Chr2   100   120  1000
#4:     Chr2   200   220  1000
#5:     Chr2   300   320  1000

Waldi
  • 39,242
  • 6
  • 30
  • 78