2

I have a dataframe that looks like this:

example <- data.frame(
  ID_A = c(1101, 1101, 1102, 1102, 1103, 1103),
  ID_B = c(2101, 2101, 2102, 2102, 2103, 2103),
  A = c(1,2,3,1,2,3),
  B = c(2,2,3,2,2,3),
  C = c(3,2,3,3,2,3)
)

For the selected variables of interest, I want to apply the following steps. I am essentially creating 3 new columns from the original column:

example$C.c <- scale(example$C, scale = FALSE)
example <- example %>%
  group_by(ID_A) %>%
  dplyr::mutate(., C.cb = mean(C.c, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(ID_A) %>%
  dplyr::mutate(., C.cw = C.c - C.cb) %>%
  ungroup() 

The variables of interest here are var_A, var_B and var_C. This is the desired resulting dataframe:

example_solution <- data.frame(
  ID_A = c(1101, 1101, 1102, 1102, 1103, 1103),
  ID_B = c(2101, 2101, 2102, 2102, 2103, 2103),
  A = c(1,2,3,1,2,3),
  B = c(2,2,3,2,2,3),
  C = c(3,2,3,3,2,3),
  A.c = c(-1, 0, 1, -1, 0, 1),
  A.cb = c(-0.5, -0.5, 0.0, 0.0, 0.5, 0.5),
  A.cw = c(-0.5, 0.5, 1.0, -1.0, -0.5, 0.5),
  B.c = c(-0.3333333, -0.3333333, 0.6666667, -0.3333333, -0.3333333, 0.6666667),
  B.cb = c(-0.3333333, -0.3333333, 0.1666667, 0.1666667, 0.1666667, 0.1666667),
  B.cw = c(0.0, 0.0, 0.5, -0.5, -0.5, 0.5),
  C.c = c(0.3333333, -0.6666667, 0.3333333, 0.3333333, -0.6666667, 0.3333333),
  C.cb = c(-0.1666667, -0.1666667, 0.3333333, 0.3333333, -0.1666667, -0.1666667),
  C.cw = c(0.5, -0.5, 0.0, 0.0, -0.5, 0.5)
)

How can I do this efficiently? Thank you!

Edit: Removed var_ in front of variables A, B and C.

jo_
  • 677
  • 2
  • 11

1 Answers1

2

Here's one way to do this with dplyr

Original Answer

library(dplyr)

glimpse(example)
#> Rows: 6
#> Columns: 5
#> $ ID_A  <dbl> 1101, 1101, 1102, 1102, 1103, 1103
#> $ ID_B  <dbl> 2101, 2101, 2102, 2102, 2103, 2103
#> $ var_A <dbl> 1, 2, 3, 1, 2, 3
#> $ var_B <dbl> 2, 2, 3, 2, 2, 3
#> $ var_C <dbl> 3, 2, 3, 3, 2, 3

example_solution <- example %>%
  mutate(across(starts_with('var_'),
                ~scale(., scale = FALSE)[,1],
                .names = '{.col}.c')) %>%
  group_by(ID_A) %>%
  mutate(across(ends_with('.c'),
                ~mean(., na.rm = TRUE),
                .names = '{.col}b')) %>%
  mutate(across(ends_with('.c'),
                .names = '{col}w') - 
           across(ends_with('cb'))) %>%
  ungroup()

glimpse(example_solution)
#> Rows: 6
#> Columns: 14
#> $ ID_A     <dbl> 1101, 1101, 1102, 1102, 1103, 1103
#> $ ID_B     <dbl> 2101, 2101, 2102, 2102, 2103, 2103
#> $ var_A    <dbl> 1, 2, 3, 1, 2, 3
#> $ var_B    <dbl> 2, 2, 3, 2, 2, 3
#> $ var_C    <dbl> 3, 2, 3, 3, 2, 3
#> $ var_A.c  <dbl> -1, 0, 1, -1, 0, 1
#> $ var_B.c  <dbl> -0.3333333, -0.3333333, 0.6666667, -0.3333333, -0.3333333, 0.…
#> $ var_C.c  <dbl> 0.3333333, -0.6666667, 0.3333333, 0.3333333, -0.6666667, 0.33…
#> $ var_A.cb <dbl> -0.5, -0.5, 0.0, 0.0, 0.5, 0.5
#> $ var_B.cb <dbl> -0.3333333, -0.3333333, 0.1666667, 0.1666667, 0.1666667, 0.16…
#> $ var_C.cb <dbl> -0.1666667, -0.1666667, 0.3333333, 0.3333333, -0.1666667, -0.…
#> $ var_A.cw <dbl> -0.5, 0.5, 1.0, -1.0, -0.5, 0.5
#> $ var_B.cw <dbl> 0.0, 0.0, 0.5, -0.5, -0.5, 0.5
#> $ var_C.cw <dbl> 0.5, -0.5, 0.0, 0.0, -0.5, 0.5

Using a vector of variable names

vars_list <- c('var_A',
               'var_B',
               'var_C')

solution2 <- example %>%
  mutate(across(all_of(vars_list),
                ~scale(., scale = FALSE)[,1],
                .names = '{.col}.c')) %>%
  group_by(ID_A) %>%
  mutate(across(ends_with('.c'),
                ~mean(., na.rm = TRUE),
                .names = '{.col}b')) %>%
  mutate(across(ends_with('.c'),
                .names = '{col}w') - 
           across(ends_with('cb'))) %>%
  ungroup()

glimpse(solution2)
#> Rows: 6
#> Columns: 14
#> $ ID_A     <dbl> 1101, 1101, 1102, 1102, 1103, 1103
#> $ ID_B     <dbl> 2101, 2101, 2102, 2102, 2103, 2103
#> $ var_A    <dbl> 1, 2, 3, 1, 2, 3
#> $ var_B    <dbl> 2, 2, 3, 2, 2, 3
#> $ var_C    <dbl> 3, 2, 3, 3, 2, 3
#> $ var_A.c  <dbl> -1, 0, 1, -1, 0, 1
#> $ var_B.c  <dbl> -0.3333333, -0.3333333, 0.6666667, -0.3333333, -0.3333333, 0.…
#> $ var_C.c  <dbl> 0.3333333, -0.6666667, 0.3333333, 0.3333333, -0.6666667, 0.33…
#> $ var_A.cb <dbl> -0.5, -0.5, 0.0, 0.0, 0.5, 0.5
#> $ var_B.cb <dbl> -0.3333333, -0.3333333, 0.1666667, 0.1666667, 0.1666667, 0.16…
#> $ var_C.cb <dbl> -0.1666667, -0.1666667, 0.3333333, 0.3333333, -0.1666667, -0.…
#> $ var_A.cw <dbl> -0.5, 0.5, 1.0, -1.0, -0.5, 0.5
#> $ var_B.cw <dbl> 0.0, 0.0, 0.5, -0.5, -0.5, 0.5
#> $ var_C.cw <dbl> 0.5, -0.5, 0.0, 0.0, -0.5, 0.5

Created on 2023-08-15 with reprex v2.0.2

Seth
  • 1,659
  • 1
  • 4
  • 11
  • thank you @Seth! This is a very elegant solution. However, not all my variables in the actual dataframe start with 'var_'. Is there any way I can pass a list of variables in there? Or use a loop perhaps? – jo_ Aug 15 '23 at 15:54
  • 1
    @jo_ modify this example. R is better written avoiding loops – stefan_aus_hannover Aug 15 '23 at 15:56
  • 1
    Hi @jo_ I've updated to try and address what I think you're asking. As stefan_aus_hannover notes, it would be best if this were captured in the question itself. – Seth Aug 15 '23 at 16:03
  • Yes! You got it perfectly right. I have also updated the question. Thank you very much! – jo_ Aug 15 '23 at 16:04
  • 2
    This is pretty much what my answer was going to be (but I'm slow and not great at this). I found this other question to be [a good reference](https://stackoverflow.com/questions/70096683/mutate-across-multiple-columns-to-create-new-variable-sets) – ScottyJ Aug 15 '23 at 16:11