2

Consider two tibbles data and key, given here:

library(tidyverse) # v1.3.2
set.seed(123)

data <- tibble(id = rep(LETTERS[1:10], each = 10),
               position = rep(1:10, 10),
               zip = sample(letters, 100, replace = T),
               zap = sample(letters, 100, replace = T),
               zop = sample(letters, 100, replace = T))

# A tibble: 100 × 5
   id    position zip   zap   zop  
   <chr>    <int> <chr> <chr> <chr>
 1 A            1 l     n     u    
 2 A            2 y     f     h    
 3 A            3 n     y     u    
 4 A            4 c     h     g    
 5 A            5 n     l     t    
 6 A            6 g     z     r    
 7 A            7 c     d     q    
 8 A            8 w     m     a    
 9 A            9 v     n     b    
10 A           10 z     u     q    
# … with 90 more rows
key <- tibble(id = c("A","D","H"),
              start = c(2, 5, 7),
              end = c(4, 6, 9))

# A tibble: 3 × 3
  id    start   end
  <chr> <dbl> <dbl>
1 A         2     4
2 D         5     6
3 H         7     9

And the desired output:

# A tibble: 8 × 5
  id    position zip   zap   zop  
  <chr>    <int> <chr> <chr> <chr>
1 A            2 s     u     w    
2 A            3 n     e     a    
3 A            4 c     h     h    
4 D            5 j     j     w    
5 D            6 m     e     z    
6 H            7 m     v     h    
7 H            8 e     q     w    
8 H            9 v     j     y

What's the most efficient way to subset data by id and the range of position given by key? I can think of two approaches, but neither is very fast.

1. apply() across rows of key, and bind the pieces

apply(X = key, MARGIN = 1, function(x) {
  data |>
    dplyr::filter(id == x[1],
                  position %in% x[2]:x[3])
  }
) |> dplyr::bind_rows()

2. pivot and fill key, then join()

key |> tidyr::pivot_longer(cols = c(start, end),
                           values_to = "position") |>
       dplyr::select(id, position) |>
       dplyr::group_by(id) |>
       tidyr::complete(position = seq(from = min(position),
                                      to = max(position))) |>
       dplyr::left_join(data)

What tidy approach would likely be fastest given data with millions of lines and a key with hundreds?

acvill
  • 395
  • 7
  • 15
  • 2
    See also this possible duplicate: https://stackoverflow.com/questions/24480031/overlap-join-with-start-and-end-positions. You probably want an "overlap join" – MrFlick Nov 14 '22 at 21:00

2 Answers2

3

We may do an inner_join and then slice after grouping

library(dplyr)
inner_join(data, key) %>% 
  group_by(id) %>%
  slice(first(start):first(end)) %>% 
  ungroup %>%
  select(-c(start, end))

-output

# A tibble: 8 × 5
  id    position zip   zap   zop  
  <chr>    <int> <chr> <chr> <chr>
1 A            2 s     u     w    
2 A            3 n     e     a    
3 A            4 c     h     h    
4 D            5 j     j     w    
5 D            6 m     e     z    
6 H            7 m     v     h    
7 H            8 e     q     w    
8 H            9 v     j     y    

Or another option is to make use of cur_group() after grouping by 'id' to subset the 'key' row

data %>%
   filter(id %in% key$id) %>% 
   group_by(id) %>%
   filter(row_number() >= key$start[match(cur_group()$id, key$id)], 
       row_number() <= key$end[match(cur_group()$id, key$id)] ) %>% 
   ungroup

-output

# A tibble: 8 × 5
  id    position zip   zap   zop  
  <chr>    <int> <chr> <chr> <chr>
1 A            2 s     u     w    
2 A            3 n     e     a    
3 A            4 c     h     h    
4 D            5 j     j     w    
5 D            6 m     e     z    
6 H            7 m     v     h    
7 H            8 e     q     w    
8 H            9 v     j     y  
akrun
  • 874,273
  • 37
  • 540
  • 662
  • 1
    [I did some benchmarking](https://stackoverflow.com/a/74482716/7976890) to compare your methods to mine, and it seems your `inner_join` approach is the fastest by far. – acvill Nov 18 '22 at 13:53
1

I did some benchmarking of my methods and the methods provided by akrun. Overall, it seems like the function that uses inner_join is most efficient.

Load libraries and create mock data d_ and key k_

library(tidyverse)
library(microbenchmark)
set.seed(123)
d_ <- tibble(id = rep(LETTERS[1:20], each = 1000),
               position = rep(1:1000, 20))
k_ <- tibble(id = LETTERS[1:20],
              start = as.double(sample(500,20)),
              end = start + 300)

Write different methods as functions

method1 <- function(data, key) {
  apply(X = key, MARGIN = 1, function(x) {
  data |> dplyr::filter(id == x[1],
                        position %in% x[2]:x[3])
    }
  ) |> dplyr::bind_rows()
}

method2 <- function(data, key) {
  key |> tidyr::pivot_longer(cols = c(start, end),
                             values_to = "position") |>
    dplyr::select(id, position) |>
    dplyr::group_by(id) |>
    tidyr::complete(position = seq(from = min(position),
                                   to = max(position))) |>
    dplyr::left_join(data)
}

method3 <- function(data, key) {
  dplyr::inner_join(data, key) |> 
    group_by(id) |>
    dplyr::slice(dplyr::first(start):dplyr::first(end)) |> 
    dplyr::ungroup() |>
    dplyr::select(-c(start, end))
}

method4 <- function(data, key) {
  data |>
    dplyr::filter(id %in% key$id) |> 
    dplyr::group_by(id) |>
    dplyr::filter(dplyr::row_number() >= key$start[match(dplyr::cur_group()$id,
                                                         key$id)], 
                  dplyr::row_number() <= key$end[match(dplyr::cur_group()$id,
                                                       key$id)]
                  ) |>
    dplyr::ungroup()
}

Evaluate each function 100 times with microbenchmark

mbm <- microbenchmark("acvill 1" = { method1(d_, k_) },
                      "acvill 2" = { method2(d_, k_) },
                      "akrun 1" = { method3(d_, k_) },
                      "akrun 2" = { method4(d_, k_) },
                      times = 100)

Plot benchmarking results

ggplot(data = tibble(method = mbm$expr, time = mbm$time)) +
  geom_violin(mapping = aes(x = method, y = time/10^6, fill = method)) +
  ylab("milliseconds") +
  theme_classic() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(limits = c(0,400),
                     breaks = seq(0,400,50)) +
  theme(axis.title.y = element_blank(),
        axis.text = element_text(color = "black", size = 10),
        legend.position = "none") +
  coord_flip()

benchmark violin plot

acvill
  • 395
  • 7
  • 15