0

I'm looking for a neater tidyverse method for:

  1. consolidating multiple sets of columns with the same naming structure (alpha1, beta1, etc.) into columns (OUT1, OUT2, etc.) when the amount of columns in each group varies and
  2. creating an index variable to indicate which group it came from when

I've been able to find [this][1] nifty gather/ spread solution by hadley - unfortunately, the gather() package appears to have been superceded by pivot_longer(), which I'm not as familiar with. Does anyone have a cleaner way of processing this code? I've been able to use coalesce() by specifying the columns of interest, but would like to make my code more flexible for broader applications.

library(reprex)
library(tidyverse)

set.seed(142)
df <- data.frame(
      ID = floor(runif(10, min = 3000, max = 3500)), 
      alpha1 = c(runif(1,min = 0, max = 10), rep(NA, 4),runif(2,min = 0, max = 10), rep(NA, 3)), 
      alpha2 = c(runif(1,min = 0, max = 10), rep(NA, 4),runif(2,min = 0, max = 10), rep(NA, 3)), 
      alpha3 = c(runif(1,min = 0, max = 10), rep(NA, 4),runif(2,min = 0, max = 10), rep(NA, 3)),
      beta1 = c(NA, runif(3,min = 0, max = 10),  rep(NA, 3), runif(2,min = 0, max = 10), NA), 
      beta2 = c(NA, runif(3,min = 0, max = 10),  rep(NA, 3), runif(2,min = 0, max = 10), NA), 
      beta3 = c(NA, runif(3,min = 0, max = 10),  rep(NA, 3), runif(2,min = 0, max = 10), NA), 
      beta4 = c(NA, runif(3,min = 0, max = 10),  rep(NA, 3), runif(2,min = 0, max = 10), NA), 
      beta5 = c(NA, runif(3,min = 0, max = 10),  rep(NA, 3), runif(2,min = 0, max = 10), NA), 
      beta6 = c(NA, runif(3,min = 0, max = 10),  rep(NA, 3), runif(2,min = 0, max = 10), NA), 
      gamma1 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)), 
      gamma2 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)), 
      gamma3 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)), 
      gamma4 = c(rep(NA, 4), runif(1,min = 0, max = 10), rep(NA, 5)), 
      epsilon1 = c(rep(NA, 9), runif(1,min = 0, max = 10)), 
      epsilon2 = c(rep(NA, 9), runif(1,min = 0, max = 10)), 
      epsilon3 = c(rep(NA, 9), runif(1,min = 0, max = 10)), 
      epsilon4 = c(rep(NA, 9), runif(1,min = 0, max = 10)), 
      epsilon5 = c(rep(NA, 9), runif(1,min = 0, max = 10))
      )

print(df) 
#>      ID   alpha1   alpha2   alpha3     beta1    beta2    beta3     beta4
#> 1  3447 8.565120 5.608665 1.469363        NA       NA       NA        NA
#> 2  3349       NA       NA       NA 1.7662342 9.024467 3.359498 5.5526906
#> 3  3477       NA       NA       NA 9.3000108 4.527420 6.349884 3.2607585
#> 4  3280       NA       NA       NA 9.3731781 4.451956 8.666138 0.8562113
#> 5  3405       NA       NA       NA        NA       NA       NA        NA
#> 6  3433 8.814155 5.152528 5.361044        NA       NA       NA        NA
#> 7  3140 7.637371 5.560464 5.840688        NA       NA       NA        NA
#> 8  3341       NA       NA       NA 0.5090031 8.388378 6.693092 9.6454022
#> 9  3216       NA       NA       NA 4.5328798 6.902459 3.768740 7.2237054
#> 10 3216       NA       NA       NA        NA       NA       NA        NA
#>        beta5    beta6   gamma1    gamma2   gamma3   gamma4 epsilon1 epsilon2
#> 1         NA       NA       NA        NA       NA       NA       NA       NA
#> 2  8.1177462 3.547035       NA        NA       NA       NA       NA       NA
#> 3  0.5366714 6.176950       NA        NA       NA       NA       NA       NA
#> 4  2.5554633 8.778866       NA        NA       NA       NA       NA       NA
#> 5         NA       NA 6.896772 0.6065131 1.021765 3.835954       NA       NA
#> 6         NA       NA       NA        NA       NA       NA       NA       NA
#> 7         NA       NA       NA        NA       NA       NA       NA       NA
#> 8  1.2867988 1.945821       NA        NA       NA       NA       NA       NA
#> 9  3.1742775 6.589169       NA        NA       NA       NA       NA       NA
#> 10        NA       NA       NA        NA       NA       NA 1.782508 9.237419
#>    epsilon3 epsilon4 epsilon5
#> 1        NA       NA       NA
#> 2        NA       NA       NA
#> 3        NA       NA       NA
#> 4        NA       NA       NA
#> 5        NA       NA       NA
#> 6        NA       NA       NA
#> 7        NA       NA       NA
#> 8        NA       NA       NA
#> 9        NA       NA       NA
#> 10 5.582417 8.268571 3.851855

out <- df %>% 
  # create consolidated variables for each category  
    mutate(OUT1 = coalesce(alpha1, beta1, gamma1, epsilon1), 
           OUT2 = coalesce(alpha2, beta2, gamma2, epsilon2), 
           OUT3 = coalesce(alpha3, beta3, gamma3, epsilon3),
           OUT4 = coalesce(beta4, gamma4, epsilon4),
           OUT5 = coalesce(beta5, epsilon5), 
           OUT6 = beta6, 
          # create index to indicate which group it came from 
           index = case_when(
            !is.na(alpha1) ~ "alpha", 
            !is.na(beta1) ~ "beta", 
            !is.na(gamma1) ~ "gamma", 
            !is.na(epsilon1) ~ "epsilon"
           )) %>% 
    select(ID, index, starts_with("OUT"))
           
print(out)
#>      ID   index      OUT1      OUT2     OUT3      OUT4      OUT5     OUT6
#> 1  3447   alpha 8.5651199 5.6086651 1.469363        NA        NA       NA
#> 2  3349    beta 1.7662342 9.0244668 3.359498 5.5526906 8.1177462 3.547035
#> 3  3477    beta 9.3000108 4.5274197 6.349884 3.2607585 0.5366714 6.176950
#> 4  3280    beta 9.3731781 4.4519563 8.666138 0.8562113 2.5554633 8.778866
#> 5  3405   gamma 6.8967720 0.6065131 1.021765 3.8359539        NA       NA
#> 6  3433   alpha 8.8141546 5.1525276 5.361044        NA        NA       NA
#> 7  3140   alpha 7.6373710 5.5604635 5.840688        NA        NA       NA
#> 8  3341    beta 0.5090031 8.3883777 6.693092 9.6454022 1.2867988 1.945821
#> 9  3216    beta 4.5328798 6.9024590 3.768740 7.2237054 3.1742775 6.589169
#> 10 3216 epsilon 1.7825080 9.2374190 5.582417 8.2685712 3.8518549 


  [1]: https://stackoverflow.com/questions/25925556/gather-multiple-sets-of-columns/25932131#25932131
M--
  • 25,431
  • 8
  • 61
  • 93

0 Answers0