2

I have a data frame containing groups. One column, ntimes, indicates how many times each group should be repeated while keeping the same row order. I want to extend my data frame by repeating each group of rows the given number of times. However, I am not sure how to do this in an efficient way.

My example:

  # Repeat rows by group
# Example data: three groups of three rows each; ntimes is constant within
# a group and gives the number of times that group should be repeated.
set.seed(5)
df <- data.frame(
  my.group = rep(c("a", "b", "z"), each = 3),
  vals = runif(9),
  ntimes = rep(c(3, 1, 2), each = 3)
)

  my.group      vals ntimes
1        a 0.1104530      3
2        a 0.2732849      3
3        a 0.4905132      3
4        b 0.3184040      1
5        b 0.5591728      1
6        b 0.2625931      1
7        z 0.2018752      2
8        z 0.3875257      2
9        z 0.8878698      2

Group a should be repeated 3 times, group b 1 time, and group z 2 times.

Expected output:

  my.group      vals ntimes
1        a 0.1104530      3
2        a 0.2732849      3
3        a 0.4905132      3
4        a 0.1104530      3
5        a 0.2732849      3
6        a 0.4905132      3
7        a 0.1104530      3
8        a 0.2732849      3
9        a 0.4905132      3
10       b 0.3184040      1
11       b 0.5591728      1
12       b 0.2625931      1
13       z 0.2018752      2
14       z 0.3875257      2
15       z 0.8878698      2
16       z 0.2018752      2
17       z 0.3875257      2
18       z 0.8878698      2

I have tried several approaches using dplyr, but none of them produces the desired output:

# repeat df rows by group
library(dplyr)

# Attempt 1: mutate() only adds a column (`new`) to the existing rows, so
# the number of rows is unchanged and no group is repeated.
df %>% 
  group_by(my.group) %>% 
  mutate(new = rep(seq_len(n()/2), each = 2, length.out = n()))


# Attempt 2: `each = 2` repeats every row index twice in a row
# (1, 1, 2, 2, ...) instead of repeating the group as a block, and the
# repeat count is the literal 2 rather than the group's ntimes value.
df %>% 
  group_by(my.group) %>% 
  slice(rep(1:n(), each = 2))


# Attempt 3: `count` is filled from a literal vector and is not referenced
# afterwards.
# NOTE(review): expand() is not loaded by library(dplyr) above (tidyr
# provides a function of that name) -- confirm which package was intended.
# NOTE(review): `1:ntimes` is given a whole column here; `:` presumably uses
# only its first element -- verify the behaviour on the R version in use.
df %>% 
  group_by(my.group) %>% 
  mutate(count = c(3,1,2)) %>% 
  expand(ntimes = seq(1:ntimes))

I highly appreciate any suggestions.

maycca
  • 3,848
  • 5
  • 36
  • 67

2 Answers2

2

Pass a single value to the `times` argument of `rep`. Since you want to do this by group, you can use any value from the group's `ntimes` column (the value is the same for every row of a group).

library(dplyr)

# Within each group, repeat the whole block of row indices `ntimes` times.
# rep(x, times) with a single `times` value repeats the full vector
# (1, 2, 3, 1, 2, 3, ...), so the within-group row order is kept.
# seq_len(n()) is used instead of 1:n() because 1:0 would yield c(1, 0);
# ungroup() drops the grouping so later steps do not run per group by
# accident.
df %>%
  group_by(my.group) %>%
  slice(rep(seq_len(n()), first(ntimes))) %>%
  ungroup()
# A similar variation indexes along the column instead of using n():
# df %>%
#   group_by(my.group) %>%
#   slice(rep(seq_along(ntimes), first(ntimes))) %>%
#   ungroup()

#  my.group  vals ntimes
#  <fct>    <dbl>  <int>
# 1 a        0.110      3
# 2 a        0.273      3
# 3 a        0.491      3
# 4 a        0.110      3
# 5 a        0.273      3
# 6 a        0.491      3
# 7 a        0.110      3
# 8 a        0.273      3
# 9 a        0.491      3
#10 b        0.318      1
#11 b        0.559      1
#12 b        0.263      1
#13 z        0.202      2
#14 z        0.388      2
#15 z        0.888      2
#16 z        0.202      2
#17 z        0.388      2
#18 z        0.888      2

Doing this in base R is surprisingly convoluted, or maybe there is a simpler way that I can't figure out:

# Base R: split the row indices by group, repeat each group's block of
# indices as many times as that group's first ntimes value, then use the
# combined index vector to subset df.
# seq_len(nrow(df)) replaces 1:nrow(df), which would generate the bogus
# indices c(1, 0) for a zero-row data frame.
df[unlist(Map(rep, split(seq_len(nrow(df)), df$my.group),
                   tapply(df$ntimes, df$my.group, head, 1))), ]

data

# Reproducible copy of the example data in dput() form: my.group is a factor
# with levels "a", "b", "z"; vals is numeric; ntimes is integer; row names
# are the character strings "1" to "9".
df <- structure(list(my.group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L, 3L), .Label = c("a", "b", "z"), class = "factor"), vals = c(0.110453, 
0.2732849, 0.4905132, 0.318404, 0.5591728, 0.2625931, 0.2018752, 
0.3875257, 0.8878698), ntimes = c(3L, 3L, 3L, 1L, 1L, 1L, 2L, 
2L, 2L)), class = "data.frame", row.names = c("1", "2", "3", 
"4", "5", "6", "7", "8", "9"))
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
  • 1
    Other base R solution if doing this by group is necessary: `r <- with(df, ave(ntimes, my.group, FUN = function(x) rep(x[1], length(x)))); df[rep(1:nrow(df), r),]` – Jaap Jan 15 '20 at 10:42
  • 1
    @Jaap I don't think that keeps the row order which I think is important for OP as mentioned in the post. Hence, I don't agree with the duplicate as well. – Ronak Shah Jan 15 '20 at 11:25
0

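tidyr::uncount duplicates rows according to a count column and may be what you're looking for. Note that it repeats each row consecutively (row 1, row 1, row 1, row 2, ...) rather than repeating each group as a block, so the within-group order differs from the expected output in the question.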

# Rebuild the question's example data (three groups of three rows; ntimes is
# constant within each group) and print it.
set.seed(5)
df <- data.frame(
  my.group = rep(c("a", "b", "z"), each = 3),
  vals = runif(9),
  ntimes = rep(c(3, 1, 2), each = 3)
)
df
#>   my.group      vals ntimes
#> 1        a 0.2002145      3
#> 2        a 0.6852186      3
#> 3        a 0.9168758      3
#> 4        b 0.2843995      1
#> 5        b 0.1046501      1
#> 6        b 0.7010575      1
#> 7        z 0.5279600      2
#> 8        z 0.8079352      2
#> 9        z 0.9565001      2
# Duplicate every row according to its ntimes value; .remove = FALSE keeps
# the ntimes column in the result.
tidyr::uncount(df, weights = ntimes, .remove = FALSE)
#>    my.group      vals ntimes
#> 1         a 0.2002145      3
#> 2         a 0.2002145      3
#> 3         a 0.2002145      3
#> 4         a 0.6852186      3
#> 5         a 0.6852186      3
#> 6         a 0.6852186      3
#> 7         a 0.9168758      3
#> 8         a 0.9168758      3
#> 9         a 0.9168758      3
#> 10        b 0.2843995      1
#> 11        b 0.1046501      1
#> 12        b 0.7010575      1
#> 13        z 0.5279600      2
#> 14        z 0.5279600      2
#> 15        z 0.8079352      2
#> 16        z 0.8079352      2
#> 17        z 0.9565001      2
#> 18        z 0.9565001      2

Created on 2023-01-11 with reprex v2.0.2

Arthur Yip
  • 5,810
  • 2
  • 31
  • 50