2

I have a data frame as follows:

structure(list(symbol = c("u", "n", "v", "i", "a"), start = c(9L,
6L, 10L, 8L, 7L), end = c(14L, 15L, 12L, 13L, 11L)), .Names = c("symbol",
"start", "end"), class = "data.frame", row.names = c("1", "2",
"3", "4", "5"))

I want to as many rows as there are values in the range of (start, end) for each symbol. So, the final data frame will look like:

structure(list(symbol = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L), .Label = c("a", "l", "n", "v", "y"
), class = "factor"), value = c(7L, 8L, 9L, 10L, 11L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 8L, 9L, 10L, 11L, 12L, 10L,
11L, 12L, 13L, 14L, 15L, 9L, 10L, 11L, 12L, 13L)), class = "data.frame", row.names = c(NA,
-30L), .Names = c("symbol", "value"))

I was thinking I could simply have a list of values per row, and then use tidyr package's unnest as follows:

df$value <- apply(df, 1, function(x) as.list(x[2]:x[3]))
dput(df)
structure(list(symbol = structure(c(4L, 3L, 5L, 2L, 1L), .Label = c("a",
"i", "n", "u", "v"), class = "factor"), start = c(9L, 6L, 10L,
8L, 7L), end = c(14L, 15L, 12L, 13L, 11L), value = structure(list(
    `1` = list(9L, 10L, 11L, 12L, 13L, 14L), `2` = list(6L, 7L,
        8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L), `3` = list(10L,
        11L, 12L), `4` = list(8L, 9L, 10L, 11L, 12L, 13L), `5` = list(
        7L, 8L, 9L, 10L, 11L)), .Names = c("1", "2", "3", "4",
"5"))), .Names = c("symbol", "start", "end", "value"), row.names = c("1",
"2", "3", "4", "5"), class = "data.frame")

df
  symbol start end                              value
1      u     9  14              9, 10, 11, 12, 13, 14
2      n     6  15 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
3      v    10  12                         10, 11, 12
4      i     8  13               8, 9, 10, 11, 12, 13
5      a     7  11                    7, 8, 9, 10, 11

Then do:

library(tidyr)
unnest(df, value)

However, I think I am hitting this pending feature/bug: https://github.com/tidyverse/tidyr/issues/278

Error: Each column must either be a list of vectors or a list of data frames [value]

Is there a better way to do this, especially avoiding apply family?

Gopala
  • 10,363
  • 7
  • 45
  • 77

3 Answers3

4

With dplyr, we can use rowwise with do

library(dplyr)
df1 %>% 
   rowwise() %>% 
   do(data.frame(symbol= .$symbol, value = .$start:.$end)) %>% 
   arrange(symbol)
# A tibble: 30 x 2
#   symbol value
#    <chr> <int>
# 1      a     7
# 2      a     8
# 3      a     9
# 4      a    10
# 5      a    11
# 6      i     8
# 7      i     9
# 8      i    10
# 9      i    11
#10      i    12
# ... with 20 more rows
akrun
  • 874,273
  • 37
  • 540
  • 662
  • 1
    Darn simple, huh! I just keep forgetting how much power the `do` has in a pipeline. Tried to play with a bit for this problem, but just could not come up with the right steps. Perfect. Thank you! – Gopala Oct 20 '17 at 11:27
1

You could use data.table and replicate the df by the required number of rows (based on the start and end for each symbol), then assign the value to each row after

library(data.table)

setDT(df)
df[rep(1:.N, (end - start + 1))][, value := (start - 1) + (1:.N), by = symbol][]

#    symbol start end value
# 1:      u     9  14     9
# 2:      u     9  14    10
# 3:      u     9  14    11
# 4:      u     9  14    12
# 5:      u     9  14    13
# ... etc
SymbolixAU
  • 25,502
  • 4
  • 67
  • 139
1

Perhaps you could use map2 to add a column from which we can unnest into the desired result.

library(tidyverse)
df %>% 
  mutate(value = map2(start, end, ~ seq(from = .x, to = .y))) %>%
  select(symbol, value) %>%
  unnest()
#>    symbol    value
#> 1       u        9
#> 2       u       10
#> 3       u       11
#> 4       u       12
#> 5       u       13
#> 6       u       14
#> 7       n        6
#> 8       n        7
#> 9       n        8
#> 10      n        9
#> ...etc
markdly
  • 4,394
  • 2
  • 19
  • 27