4

I have a simple data frame:

df <- data.frame(X = LETTERS[1:20], 
                 Y = paste0("abc_", 1:20))
#  X     Y
#1 A abc_1
#2 B abc_2
#3 C abc_3
#4 D abc_4
#5 E abc_5
#6 F abc_6
# ...

I need to assign group ID based on two vectors of integers. One indicates where the group starts, the other indicates where the group ends:

start_ix <- c(2, 5, 8, 10, 15, 18)
end_ix   <- c(4, 7, 9, 13, 17, 19)

i.e. the first group is rows 2 through 4, the second is row 5 through 7, and so on. Any row not contained in these indexes (or the span between the start and stop values) should be NA.

The desired outcome would be:

df_want <- structure(list(X = c("A", "B", "C", "D", "E", "F", "G", "H", 
"I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T"), 
    Y = c("abc_1", "abc_2", "abc_3", "abc_4", "abc_5", "abc_6", 
    "abc_7", "abc_8", "abc_9", "abc_10", "abc_11", "abc_12", 
    "abc_13", "abc_14", "abc_15", "abc_16", "abc_17", "abc_18", 
    "abc_19", "abc_20"), grp = c(NA, 1, 1, 1, 2, 2, 2, 3, 3, 
    4, 4, 4, 4, NA, 5, 5, 5, 6, 6, NA)), row.names = c(NA, -20L
), class = "data.frame")

#    X      Y grp
# 1  A  abc_1  NA
# 2  B  abc_2   1
# 3  C  abc_3   1
# 4  D  abc_4   1
# 5  E  abc_5   2
# 6  F  abc_6   2
# 7  G  abc_7   2
# 8  H  abc_8   3
# 9  I  abc_9   3
# 10 J abc_10   4
# 11 K abc_11   4
# 12 L abc_12   4
# 13 M abc_13   4
# 14 N abc_14  NA
# 15 O abc_15   5
# 16 P abc_16   5
# 17 Q abc_17   5
# 18 R abc_18   6
# 19 S abc_19   6
# 20 T abc_20  NA

The solution in my specific case would need to be done in base R, but for the sake of others who may have the same issue feel free to post solutions from external packages.

I have tried a combination of indexing, sorting, and seq but can't seem to come up with a solution.

zx8754
  • 52,746
  • 12
  • 114
  • 209
jpsmith
  • 11,023
  • 5
  • 15
  • 36

5 Answers5

3

A base method using sequence.

. <- 1 + end_ix - start_ix
df[sequence(., start_ix), "grp"] <- rep(seq_along(.), .)

Another base methode using Map.

. <- Map(seq, start_ix, end_ix)
df[unlist(.), "grp"] <- rep(seq_along(.), lengths(.))

Benchmark

bench::mark(
jpsmith = local({seqV <- Vectorize(seq.default, vectorize.args = c("to", "from"))
  ix <- unlist(seqV(start_ix, end_ix))
  df[ix, "grp"] <- rep(1:length(start_ix), (end_ix - start_ix) + 1)
  df}),
zx8754 = local({for(i in seq_along(start_ix)){
  df[ start_ix[ i ]:end_ix[ i ], "grp"] <- i}
  df}),
GKiSeq = local({. <- 1 + end_ix - start_ix
  df[sequence(., start_ix), "grp"] <- rep(seq_along(.), .)
  df}),
GKiMap = local({. <- Map(seq, start_ix, end_ix)
  df[unlist(.), "grp"] <- rep(seq_along(.), lengths(.))
  df})
)
#  expression      min  median itr/s…¹ mem_a…² gc/se…³ n_itr  n_gc total…⁴ result
#  <bch:expr> <bch:tm> <bch:t>   <dbl> <bch:b>   <dbl> <int> <dbl> <bch:t> <list>
#1 jpsmith     186.1µs 198.4µs   4944.    280B    12.3  2408     6   487ms <df>  
#2 zx8754      212.4µs 226.6µs   4339.    280B    12.4  2095     6   483ms <df>  
#3 GKiSeq       60.6µs  67.1µs  14679.    280B    12.3  7160     6   488ms <df>  
#4 GKiMap        103µs 111.6µs   8748.    280B    12.3  4265     6   488ms <df>  
GKi
  • 37,245
  • 2
  • 26
  • 48
1

While there may be better solutions, one potential solution which Vectorizes the seq function to index the rows, then uses a vector from the difference in start and end positions in rep to identify the groups:

#Index
seqV <- Vectorize(seq.default, vectorize.args = c("to", "from"))
ix <- unlist(seqV(start_ix, end_ix))

#Assign groups
df[ix, "grp"] <- rep(1:length(start_ix), (end_ix - start_ix) + 1)

# Validate
all.equal(df_want, df)
# [1] TRUE
jpsmith
  • 11,023
  • 5
  • 15
  • 36
1

Using for loop:

for(i in seq_along(start_ix)){
  df[ start_ix[ i ]:end_ix[ i ], "grp"] <- i
  }

Another option, range overlap, using data.table::foverlaps:

library(data.table)

df1 <- cbind(data.table(start = seq(nrow(df)), end = seq(nrow(df))), df)
df2 <- data.table(start = start_ix, end = end_ix, grp = seq_along(start_ix))

setkey(df1, start, end)
setkey(df2, start, end)

foverlaps(df1, df2)[, .(X, Y, group)]
#     X      Y   grp
#  1: A  abc_1    NA
#  2: B  abc_2     1
#  3: C  abc_3     1
#  4: D  abc_4     1
#  5: E  abc_5     2
#  etc...
zx8754
  • 52,746
  • 12
  • 114
  • 209
1

A tidyverse option might be:

ids_lst <- map2(start_ix, end_ix, seq)

df %>%
 mutate(grp = map_int(row_number(), 
                      function(rowid) match(TRUE, map2_lgl(rowid, ids_lst, function(rowid, ids) rowid %in% ids), nomatch = NA)))

   X      Y grp
1  A  abc_1  NA
2  B  abc_2   1
3  C  abc_3   1
4  D  abc_4   1
5  E  abc_5   2
6  F  abc_6   2
7  G  abc_7   2
8  H  abc_8   3
9  I  abc_9   3
10 J abc_10   4
11 K abc_11   4
12 L abc_12   4
13 M abc_13   4
14 N abc_14  NA
15 O abc_15   5
16 P abc_16   5
17 Q abc_17   5
18 R abc_18   6
19 S abc_19   6
20 T abc_20  NA
tmfmnk
  • 38,881
  • 4
  • 47
  • 67
0

Fully vectorized base R solution, using indexing, findInterval, and rbind:

df$grp <- rbind(NA, 1:nrow(df))[
  findInterval(
    1:nrow(df),
    c(
      rbind(
        start_ix,
        end_ix + 0.1
      )
    )
  ) + 1L
]

Adding to the benchmark from @GKi:

bench::mark(
  jpsmith = local({seqV <- Vectorize(seq.default, vectorize.args = c("to", "from"))
  ix <- unlist(seqV(start_ix, end_ix))
  df[ix, "grp"] <- rep(1:length(start_ix), (end_ix - start_ix) + 1)
  df}),
  zx8754 = local({for(i in seq_along(start_ix)){
    df[ start_ix[ i ]:end_ix[ i ], "grp"] <- i}
    df}),
  GKiSeq = local({. <- 1 + end_ix - start_ix
  df[sequence(., start_ix), "grp"] <- rep(seq_along(.), .)
  df}),
  GKiMap = local({. <- Map(seq, start_ix, end_ix)
  df[unlist(.), "grp"] <- rep(seq_along(.), lengths(.))
  df}),
  jblood94 = local({
    df$grp <- rbind(NA, 1:nrow(df))[
      findInterval(
        1:nrow(df),
        c(
          rbind(
            start_ix,
            end_ix + 0.1
          )
        )
      ) + 1L
    ]
    df}
  )
)
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 jpsmith      93.6µs  104.2µs     9227.   46.03KB    116. 
#> 2 zx8754       95.4µs  101.9µs     9443.      280B     27.6
#> 3 GKiSeq       28.3µs   30.2µs    32045.    5.68KB     28.9
#> 4 GKiMap       56.4µs   59.7µs    16332.      280B     29.6
#> 5 jblood94     16.4µs   17.7µs    54828.   16.98KB     32.9
jblood94
  • 10,340
  • 1
  • 10
  • 15