4

I have a data set with nested groups and with some rows missing:

set.seed(123)
df <- data.frame(Gr1 = rep(c("x", "y"), each = 10),
                 Gr2 = rep(c("x1", "x2", "y1", "y2"), each = 5),
                 ID = paste0(rep(c("x", "y"), each = 10), letters[1:5]),
                 var1 = round(rnorm(20), 2),
                 var2 = round(rnorm(20), 2))

rmv.rows <- sample(1:20, 5)
df <- df[-rmv.rows, ]

   Gr1 Gr2 ID  var1  var2
1    x  x1 xa -0.56 -1.07
3    x  x1 xc  1.56 -1.03
4    x  x1 xd  0.07 -0.73
6    x  x2 xa  1.72 -1.69
7    x  x2 xb  0.46  0.84
9    x  x2 xd -0.69 -1.14
10   x  x2 xe -0.45  1.25
11   y  y1 ya  1.22  0.43
12   y  y1 yb  0.36 -0.30
15   y  y1 ye -0.56  0.82
16   y  y2 ya  1.79  0.69
17   y  y2 yb  0.50  0.55
18   y  y2 yc -1.97 -0.06
19   y  y2 yd  0.70 -0.31
20   y  y2 ye -0.47 -0.38

I would like to fill missing rows (i.e. combinations of Gr1, Gr2 and ID) by zeros.

I tried approaches as suggested here, however it returns all possible combinations of Gr1, Gr2 and ID and not those present in data. In other words, I would like to insert only existing combinations of Gr1, Gr2 and ID. The desired output should be:

   Gr1 Gr2 ID  var1  var2
1    x  x1 xa -0.56 -1.07
2    x  x1 xb  0.00  0.00
3    x  x1 xc  1.56 -1.03
4    x  x1 xd  0.07 -0.73
5    x  x1 xe  0.00  0.00
6    x  x2 xa  1.72 -1.69
7    x  x2 xb  0.46  0.84
8    x  x2 xc  0.00  0.00
9    x  x2 xd -0.69 -1.14
10   x  x2 xe -0.45  1.25
11   y  y1 ya  1.22  0.43
12   y  y1 yb  0.36 -0.30
13   y  y1 yc  0.00  0.00
14   y  y1 yd  0.00  0.00
15   y  y1 ye -0.56  0.82
16   y  y2 ya  1.79  0.69
17   y  y2 yb  0.50  0.55
18   y  y2 yc -1.97 -0.06
19   y  y2 yd  0.70 -0.31
20   y  y2 ye -0.47 -0.38
acylam
  • 18,231
  • 5
  • 36
  • 45
Adela
  • 1,757
  • 19
  • 37

3 Answers3

4

Here is an option that uses data.table:

library(data.table)
setDT(df)
all_comb <- df[, CJ(Gr2, ID, unique = TRUE), by = Gr1]
df_out <- df[all_comb, on = .(Gr1, Gr2, ID)]
df_out[is.na(df_out)] <- 0
df_out

#     Gr1 Gr2 ID  var1  var2
#  1:   x  x1 xa -0.56 -1.07
#  2:   x  x1 xb -0.23 -0.22
#  3:   x  x1 xc  1.56 -1.03
#  4:   x  x1 xd  0.07 -0.73
#  5:   x  x1 xe  0.13 -0.63
#  6:   x  x2 xa  0.00  0.00
#  7:   x  x2 xb  0.00  0.00
#  8:   x  x2 xc  0.00  0.00
#  9:   x  x2 xd -0.69 -1.14
# 10:   x  x2 xe -0.45  1.25
# 11:   y  y1 ya  0.00  0.00
# 12:   y  y1 yb  0.36 -0.30
# 13:   y  y1 yc  0.40  0.90
# 14:   y  y1 yd  0.11  0.88
# 15:   y  y1 ye  0.00  0.00
# 16:   y  y2 ya  1.79  0.69
# 17:   y  y2 yb  0.50  0.55
# 18:   y  y2 yc -1.97 -0.06
# 19:   y  y2 yd  0.70 -0.31
# 20:   y  y2 ye -0.47 -0.38

PS.

For users that have not yet updated to R 3.6 here is the data produced by OP code in the current version of R:

df <- structure(list(Gr1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", "y"), class = "factor"), 
    Gr2 = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 
    4L, 4L, 4L, 4L, 4L), .Label = c("x1", "x2", "y1", "y2"), class = "factor"), 
    ID = structure(c(1L, 2L, 3L, 4L, 5L, 4L, 5L, 7L, 8L, 9L, 
    6L, 7L, 8L, 9L, 10L), .Label = c("xa", "xb", "xc", "xd", 
    "xe", "ya", "yb", "yc", "yd", "ye"), class = "factor"), var1 = c(-0.56, 
    -0.23, 1.56, 0.07, 0.13, -0.69, -0.45, 0.36, 0.4, 0.11, 1.79, 
    0.5, -1.97, 0.7, -0.47), var2 = c(-1.07, -0.22, -1.03, -0.73, 
    -0.63, -1.14, 1.25, -0.3, 0.9, 0.88, 0.69, 0.55, -0.06, -0.31, 
    -0.38)), row.names = c(1L, 2L, 3L, 4L, 5L, 9L, 10L, 12L, 
13L, 14L, 16L, 17L, 18L, 19L, 20L), class = "data.frame")
s_baldur
  • 29,441
  • 4
  • 36
  • 69
  • @avid_useR I think the issue is that the printout shown by OP of `df` is not correct. – s_baldur Jun 04 '19 at 15:24
  • @avid_useR shouldn't be done the other way around, that is: the code for the reproducible data is what we should go by? – s_baldur Jun 04 '19 at 15:31
  • @avid_useR wierd, I don't (using R 3.6), could it be that you are using an older version? – s_baldur Jun 04 '19 at 15:36
  • Of course I did. Just checked and sample() --- it did effectively get updated in 3.6: https://stat.ethz.ch/pipermail/r-announce/2019/000641.html – s_baldur Jun 04 '19 at 15:37
  • @avid_useR ARE you sure you are not using an old version of R? – s_baldur Jun 04 '19 at 15:40
  • I am, but so is OP. Since you have a newer/different version of R installed, you should provide the data you're using if you can't reproduce what OP has provided. – acylam Jun 04 '19 at 15:42
  • @avid_useR I don't know what the issue is. I am using the current version (R version 3.6.0, released on 2019-04-26.) and you are using an old version. – s_baldur Jun 04 '19 at 15:44
  • The issue being readers would likely get confused that your output doesn't match the input and there is no way to get your output if they have an older version. It's good now that you've provided reproducible data. – acylam Jun 04 '19 at 17:14
2

We can do it with complete and nesting from tidyr:

library(dplyr)
library(tidyr)

df %>% 
  group_by(Gr1) %>%
  complete(nesting(ID), nesting(Gr2), fill = list(var1 = 0, var2 = 0)) %>%
  arrange(Gr1, Gr2, ID) %>%
  select(Gr1, Gr2, ID, everything())

Output:

# A tibble: 20 x 5
# Groups:   Gr1 [2]
   Gr1   Gr2   ID     var1  var2
   <fct> <fct> <fct> <dbl> <dbl>
 1 x     x1    xa    -0.56 -1.07
 2 x     x1    xb     0     0   
 3 x     x1    xc     1.56 -1.03
 4 x     x1    xd     0.07 -0.73
 5 x     x1    xe     0     0   
 6 x     x2    xa     1.72 -1.69
 7 x     x2    xb     0.46  0.84
 8 x     x2    xc     0     0   
 9 x     x2    xd    -0.69 -1.14
10 x     x2    xe    -0.45  1.25
11 y     y1    ya     1.22  0.43
12 y     y1    yb     0.36 -0.3 
13 y     y1    yc     0     0   
14 y     y1    yd     0     0   
15 y     y1    ye    -0.56  0.82
16 y     y2    ya     1.79  0.69
17 y     y2    yb     0.5   0.55
18 y     y2    yc    -1.97 -0.06
19 y     y2    yd     0.7  -0.31
20 y     y2    ye    -0.47 -0.38
acylam
  • 18,231
  • 5
  • 36
  • 45
1

An option with expand and left_join

library(tidyverse)
df %>% 
  group_by(Gr1) %>%
  expand(nesting(ID), nesting(Gr2)) %>% 
  left_join(df) %>%
  mutate_at(vars(var1:var2), replace_na, 0)  %>% 
  arrange(Gr1, Gr2, ID) %>% 
  select(names(df))
# A tibble: 20 x 5
# Groups:   Gr1 [2]
#   Gr1   Gr2   ID     var1  var2
#   <chr> <chr> <chr> <dbl> <dbl>
# 1 x     x1    xa    -0.56 -1.07
# 2 x     x1    xb     0     0   
# 3 x     x1    xc     1.56 -1.03
# 4 x     x1    xd     0.07 -0.73
# 5 x     x1    xe     0     0   
# 6 x     x2    xa     1.72 -1.69
# 7 x     x2    xb     0.46  0.84
# 8 x     x2    xc     0     0   
# 9 x     x2    xd    -0.69 -1.14
#10 x     x2    xe    -0.45  1.25
#11 y     y1    ya     1.22  0.43
#12 y     y1    yb     0.36 -0.3 
#13 y     y1    yc     0     0   
#14 y     y1    yd     0     0   
#15 y     y1    ye    -0.56  0.82
#16 y     y2    ya     1.79  0.69
#17 y     y2    yb     0.5   0.55
#18 y     y2    yc    -1.97 -0.06
#19 y     y2    yd     0.7  -0.31
#20 y     y2    ye    -0.47 -0.38
akrun
  • 874,273
  • 37
  • 540
  • 662