5

I have the following input data frame:

df <- data.frame(x=c('a','b','c'),y=c(4,5,6),from=c(1,2,3),to=c(2,4,6))  
df
  x y  from to
1 a 4  1    2
2 b 5  2    4
3 c 6  3    6

Now I'd like to expand each row times the values between from and to namely ('a',4) spans two rows i.e. 1,2. The expected result looks like this:

exp <- data.frame(x=c('a','a','b','b','b','c','c','c','c'),
                  y=c(4,4,5,5,5,6,6,6,6),
                  z=c(1,2,2,3,4,3,4,5,6))
exp
  x y z
1 a 4 1
2 a 4 2
3 b 5 2
4 b 5 3
5 b 5 4
6 c 6 3
7 c 6 4
8 c 6 5
9 c 6 6

What's the most idiomatic way to accomplish this without loops?

SkyWalker
  • 13,729
  • 18
  • 91
  • 187

4 Answers4

8

One "non-tidyverse" way:

data.frame(
  x = c('a', 'b', 'c'),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6),
  stringsAsFactors = FALSE
) -> xdf

do.call(rbind.data.frame, lapply(1:nrow(xdf), function(i) {
  data.frame(x = xdf$x[i], y=xdf$y[i], z=xdf$from[i]:xdf$to[i], stringsAsFactors=FALSE)
}))

One "tidyverse" way:

library(tidyverse)

data_frame(
  x = c('a', 'b', 'c'),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6)
) -> xdf

rowwise(xdf) %>% 
  do(data_frame(x = .$x, y=.$y, z=.$from:.$to))

Another "tidyverse" way that has not been benchmarked below:

xdf %>% 
  rowwise() %>% 
  do( merge( as_tibble(.), tibble(z=.$from:.$to), by=NULL) ) %>%
  select( -from, -to )     # Omit this line if you want to keep all original columns.

Since you asked abt performance:

library(microbenchmark)

data.table::data.table(
  x = c('a','b','c'),
  y = c(4,5,6),
  from = c(1,2,3),
  to = c(2,4,6)
) -> xdt1

data.frame(
  x = c('a', 'b', 'c'),
  y = c(4, 5, 6),
  from = c(1, 2, 3),
  to = c(2, 4, 6),
  stringsAsFactors = FALSE
) -> xdf1 

data.table ops often modify in-place so keep a level playing field and make a copy of each data frame/table before doing the op.

That time penalty is ~100 nanoseconds on most modern systems.

microbenchmark(

  data.table = {
    xdt2 <- xdt1
    xdt2[, diff:= (to - from) + 1]
    xdt2 <- xdt2[rep(1:.N, diff)]
    xdt2[,z := seq(from,to), by=.(x,y,from,to)]
    xdt2[,c("x", "y", "z")]
  }, 

  base = {
    xdf2 <- xdf1
    do.call(rbind.data.frame, lapply(1:nrow(xdf2), function(i) {
      data.frame(x = xdf2$x[i], y=xdf2$y[i], z=xdf2$from[i]:xdf2$to[i], stringsAsFactors=FALSE)
    }))
  }, 

  tidyverse = {
    xdf2 <- xdf1
    dplyr::rowwise(xdf2) %>% 
      dplyr::do(dplyr::data_frame(x = .$x, y=.$y, z=.$from:.$to))
  }, 

  plyr = {
    xdf2 <- xdf1
    plyr::mdply(xdf2, function(x,y,from,to) data.frame(x,y,z=seq(from,to)))[c("x","y","z")]
  },

  times = 1000

)
## Unit: microseconds
##        expr       min         lq       mean    median         uq        max neval
##  data.table   920.361  1072.9265  1257.2321  1178.832  1280.2660  10628.552  1000
##        base   677.069   761.3145   884.4136   825.472   915.8985   5366.515  1000
##   tidyverse 15926.127 17231.5015 19201.4798 17994.919 20014.4140 166901.570  1000
##        plyr  1938.838  2196.4205  2448.5314  2322.949  2501.5075   5735.255  1000
Kalin
  • 1,691
  • 2
  • 16
  • 22
hrbrmstr
  • 77,368
  • 11
  • 139
  • 205
  • For the "tidyverse" way, you kept columns by explicit naming (`x = .$x, y=.$y`) and added a new one (`z=.$from:.$to`). Do you know how one would keep all existing columns and append the new column `z` without explicit naming of the ones to keep? That is, act like `mutate` for adding a column, but repeat rows when the new variable is that vector... Thanks for your help! – Kalin Dec 07 '18 at 19:57
  • This almost does it... `dplyr::starwars[1:2,1] %>% rowwise() %>% do( expand.grid( ., z = 1:2 ))` except I get `list` type for the first column that has a bunch of lists of length 1... – Kalin Dec 07 '18 at 20:10
2

You can use data.table

library(data.table)    
df <- data.table(x=c('a','b','c'),y=c(4,5,6),from=c(1,2,3),to=c(2,4,6))  
df <- df[, diff:= (to - from) + 1]

df <- df[rep(1:.N,diff)]
df <- df[,z := seq(from,to) , by=.(x,y,from,to)]
df

> df
   x y from to diff z
1: a 4    1  2    2 1
2: a 4    1  2    2 2
3: b 5    2  4    3 2
4: b 5    2  4    3 3
5: b 5    2  4    3 4
6: c 6    3  6    4 3
7: c 6    3  6    4 4
8: c 6    3  6    4 5
9: c 6    3  6    4 6
Hardik Gupta
  • 4,700
  • 9
  • 41
  • 83
2

I realize this has already been answered, but a one-liner data.table solution is:

library(data.table)
setDT(df)[,.(z = from:to), by = .(x,y)]

#   x y z
#1: a 4 1
#2: a 4 2
#3: b 5 2
#4: b 5 3
#5: b 5 4
#6: c 6 3
#7: c 6 4
#8: c 6 5
#9: c 6 6
Mike H.
  • 13,960
  • 2
  • 29
  • 39
0

OK using the plyr package there is a neat solution using mdply:

library(plyr)
df <- data.frame(x=c('a','b','c'),y=c(4,5,6),from=c(1,2,3),to=c(2,4,6)) 
res <- mdply(df, function(x,y,from,to) data.frame(x,y,z=seq(from,to)))[c("x","y","z")]
res
  x y z
1 a 4 1
2 a 4 2
3 b 5 2
4 b 5 3
5 b 5 4
6 c 6 3
7 c 6 4
8 c 6 5
9 c 6 6

Since it creates a data frame for every row maybe it is not super efficient ... or?

SkyWalker
  • 13,729
  • 18
  • 91
  • 187