I'm looking for a neater tidyverse method for:
- consolidating multiple sets of columns with the same naming structure (alpha1, beta1, etc.) into columns (OUT1, OUT2, etc.) when the amount of columns in each group varies and
- creating an index variable to indicate which group it came from when
I've been able to find [this][1] nifty gather/ spread solution by hadley - unfortunately, the gather() package appears to have been superceded by pivot_longer(), which I'm not as familiar with. Does anyone have a cleaner way of processing this code? I've been able to use coalesce() by specifying the columns of interest, but would like to make my code more flexible for broader applications.
library(reprex)
library(tidyverse)
set.seed(142)
df <- data.frame(
ID = floor(runif(10, min = 3000, max = 3500)),
alpha1 = c(runif(1,min = 0, max = 10), rep(NA, 4),runif(2,min = 0, max = 10), rep(NA, 3)),
alpha2 = c(runif(1,min = 0, max = 10), rep(NA, 4),runif(2,min = 0, max = 10), rep(NA, 3)),
alpha3 = c(runif(1,min = 0, max = 10), rep(NA, 4),runif(2,min = 0, max = 10), rep(NA, 3)),
beta1 = c(NA, runif(3,min = 0, max = 10), rep(NA, 3), runif(2,min = 0, max = 10), NA),
beta2 = c(NA, runif(3,min = 0, max = 10), rep(NA, 3), runif(2,min = 0, max = 10), NA),
beta3 = c(NA, runif(3,min = 0, max = 10), rep(NA, 3), runif(2,min = 0, max = 10), NA),
beta4 = c(NA, runif(3,min = 0, max = 10), rep(NA, 3), runif(2,min = 0, max = 10), NA),
beta5 = c(NA, runif(3,min = 0, max = 10), rep(NA, 3), runif(2,min = 0, max = 10), NA),
beta6 = c(NA, runif(3,min = 0, max = 10), rep(NA, 3), runif(2,min = 0, max = 10), NA),
gamma1 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)),
gamma2 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)),
gamma3 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)),
gamma4 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)),
epsilon1 = c(rep(NA, 9), runif(1,min = 0, max = 10)),
epsilon2 = c(rep(NA, 9), runif(1,min = 0, max = 10)),
epsilon3 = c(rep(NA, 9), runif(1,min = 0, max = 10)),
epsilon4 = c(rep(NA, 9), runif(1,min = 0, max = 10)),
epsilon5 = c(rep(NA, 9), runif(1,min = 0, max = 10))
)
print(df)
#> ID alpha1 alpha2 alpha3 beta1 beta2 beta3 beta4
#> 1 3447 8.565120 5.608665 1.469363 NA NA NA NA
#> 2 3349 NA NA NA 1.7662342 9.024467 3.359498 5.5526906
#> 3 3477 NA NA NA 9.3000108 4.527420 6.349884 3.2607585
#> 4 3280 NA NA NA 9.3731781 4.451956 8.666138 0.8562113
#> 5 3405 NA NA NA NA NA NA NA
#> 6 3433 8.814155 5.152528 5.361044 NA NA NA NA
#> 7 3140 7.637371 5.560464 5.840688 NA NA NA NA
#> 8 3341 NA NA NA 0.5090031 8.388378 6.693092 9.6454022
#> 9 3216 NA NA NA 4.5328798 6.902459 3.768740 7.2237054
#> 10 3216 NA NA NA NA NA NA NA
#> beta5 beta6 gamma1 gamma2 gamma3 gamma4 epsilon1 epsilon2
#> 1 NA NA NA NA NA NA NA NA
#> 2 8.1177462 3.547035 NA NA NA NA NA NA
#> 3 0.5366714 6.176950 NA NA NA NA NA NA
#> 4 2.5554633 8.778866 NA NA NA NA NA NA
#> 5 NA NA 6.896772 0.6065131 1.021765 3.835954 NA NA
#> 6 NA NA NA NA NA NA NA NA
#> 7 NA NA NA NA NA NA NA NA
#> 8 1.2867988 1.945821 NA NA NA NA NA NA
#> 9 3.1742775 6.589169 NA NA NA NA NA NA
#> 10 NA NA NA NA NA NA 1.782508 9.237419
#> epsilon3 epsilon4 epsilon5
#> 1 NA NA NA
#> 2 NA NA NA
#> 3 NA NA NA
#> 4 NA NA NA
#> 5 NA NA NA
#> 6 NA NA NA
#> 7 NA NA NA
#> 8 NA NA NA
#> 9 NA NA NA
#> 10 5.582417 8.268571 3.851855
out <- df %>%
# create consolidated variables for each category
mutate(OUT1 = coalesce(alpha1, beta1, gamma1, epsilon1),
OUT2 = coalesce(alpha2, beta2, gamma2, epsilon2),
OUT3 = coalesce(alpha3, beta3, gamma3, epsilon3),
OUT4 = coalesce(beta4, gamma4, epsilon4),
OUT5 = coalesce(beta5, epsilon5),
OUT6 = beta6,
# create index to indicate which group it came from
index = case_when(
!is.na(alpha1) ~ "alpha",
!is.na(beta1) ~ "beta",
!is.na(gamma1) ~ "gamma",
!is.na(epsilon1) ~ "epsilon"
)) %>%
select(ID, index, starts_with("OUT"))
print(out)
#> ID index OUT1 OUT2 OUT3 OUT4 OUT5 OUT6
#> 1 3447 alpha 8.5651199 5.6086651 1.469363 NA NA NA
#> 2 3349 beta 1.7662342 9.0244668 3.359498 5.5526906 8.1177462 3.547035
#> 3 3477 beta 9.3000108 4.5274197 6.349884 3.2607585 0.5366714 6.176950
#> 4 3280 beta 9.3731781 4.4519563 8.666138 0.8562113 2.5554633 8.778866
#> 5 3405 gamma 6.8967720 0.6065131 1.021765 3.8359539 NA NA
#> 6 3433 alpha 8.8141546 5.1525276 5.361044 NA NA NA
#> 7 3140 alpha 7.6373710 5.5604635 5.840688 NA NA NA
#> 8 3341 beta 0.5090031 8.3883777 6.693092 9.6454022 1.2867988 1.945821
#> 9 3216 beta 4.5328798 6.9024590 3.768740 7.2237054 3.1742775 6.589169
#> 10 3216 epsilon 1.7825080 9.2374190 5.582417 8.2685712 3.8518549
[1]: https://stackoverflow.com/questions/25925556/gather-multiple-sets-of-columns/25932131#25932131