2

I have a dataframe with several columns, and I create a new column which randomly samples a single value from either of the other columns. How can I trace back to tell which column the value came from?

I've seen the exact same question and solution here, but it's in python, and couldn't find an R equivalent.

Data 1 :: each row has different values across columns

df_uniques <-
  data.frame(
    col_a = c(2, 2, 5, 5, 3),
    col_b = c(NA, 4, 2, 3, 1),
    col_c = c(4, 5, 3, 1, 2),
    col_d = c(1, NA, 4, 2, 4),
    col_e = c(3, 3, 1, 4, 5)
  )

> df_uniques

##   col_a col_b col_c col_d col_e
## 1     2    NA     4     1     3
## 2     2     4     5    NA     3
## 3     5     2     3     4     1
## 4     5     3     1     2     4
## 5     3     1     2     4     5

Mutate a new column to sample from either previous columns

library(dplyr)

set.seed(2020)

df_uniques %>%
  rowwise() %>%
  mutate(sampled = sample(c(col_a, col_b, col_c, col_d, col_e), size = n()))

## # A tibble: 5 x 6
## # Rowwise: 
##   col_a col_b col_c col_d col_e sampled
##   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1     2    NA     4     1     3       1
## 2     2     4     5    NA     3      NA
## 3     5     2     3     4     1       5
## 4     5     3     1     2     4       5
## 5     3     1     2     4     5       4

Data 2 :: each row has duplicating values across columns

df_duplicates <-
  data.frame(
    col_a = c(1, 4, 2, 5, 2),
    col_b = c(NA, 4, NA, 3, 1),
    col_c = c(4, NA, 5, NA, NA),
    col_d = c(1, NA, NA, 2, NA),
    col_e = c(2, 3, NA, NA, 5)
  )

> df_duplicates
##   col_a col_b col_c col_d col_e
## 1     1    NA     4     1     2     
## 2     4     4    NA    NA     3
## 3     2    NA     5    NA    NA
## 4     5     3    NA     2    NA
## 5     2     1    NA    NA     5

Mutate a new column to sample from either previous columns

set.seed(2020)

df_duplicates %>%
  rowwise() %>%
  mutate(sampled = sample(c(col_a, col_b, col_c, col_d, col_e), size = n()))

## # A tibble: 5 x 6
## # Rowwise: 
##   col_a col_b col_c col_d col_e sampled
##   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1     1    NA     4     1     2      NA
## 2     4     4    NA    NA     3       4
## 3     2    NA     5    NA    NA      NA
## 4     5     3    NA     2    NA       3
## 5     2     1    NA    NA     5       1

Tracing back: which column is the origin of sampled?

Desired Output (Data 1 :: uniques)

# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>     
1     2    NA     4     1     3       1 col_d     
2     2     4     5    NA     3      NA col_d     
3     5     2     3     4     1       5 col_a     
4     5     3     1     2     4       5 col_a     
5     3     1     2     4     5       4 col_d 

Desired Output (Data 2 :: duplicates)

# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col   
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>        
1     1    NA     4     1     2       1 col_a, col_d
2     4     4    NA    NA     3      NA col_c, col_d 
3     2    NA     5    NA    NA       2 col_a        
4     5     3    NA     2    NA       5 col_a        
5     2     1    NA    NA     5      NA col_c, col_d 

Emman
  • 3,695
  • 2
  • 20
  • 44

2 Answers2

1

One option could be:

df_duplicates %>%
 rowwise() %>%
 mutate(sampled = sample(c_across(col_a:col_e), size = n()),
        origin_col = if(is.na(sampled)) toString(names(.)[which(is.na(c_across(col_a:col_e)))]) else toString(names(.)[which(c_across(col_a:col_e) == sampled)]))
     
  col_a col_b col_c col_d col_e sampled origin_col         
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>              
1     1    NA     4     1     2       1 col_a, col_d       
2     4     4    NA    NA     3       4 col_a, col_b       
3     2    NA     5    NA    NA      NA col_b, col_d, col_e
4     5     3    NA     2    NA      NA col_c, col_e       
5     2     1    NA    NA     5       2 col_a  
tmfmnk
  • 38,881
  • 4
  • 47
  • 67
1

Are you looking for something like this?

cols <- c("col_a", "col_b", "col_c", "col_d", "col_e")
workflow <- 
  . %>%
  rowwise() %>%
  mutate(
    sampled = sample(c_across(!!cols), 1L), 
    origin_col = toString(cols[which(c_across(!!cols) %in% sampled)])
  )

Output

> set.seed(2020L); workflow(df_uniques)
# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>       
1     2    NA     4     1     3       1 col_d       
2     2     4     5    NA     3      NA col_d       
3     5     2     3     4     1       5 col_a       
4     5     3     1     2     4       5 col_a       
5     3     1     2     4     5       4 col_d       

> set.seed(2020L); workflow(df_duplicates)
# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>       
1     1    NA     4     1     2       1 col_a, col_d
2     4     4    NA    NA     3      NA col_c, col_d
3     2    NA     5    NA    NA       2 col_a       
4     5     3    NA     2    NA       5 col_a       
5     2     1    NA    NA     5      NA col_c, col_d

Method 1: create a temporary variable for your selected columns

workflow <- 
  . %>%
  rowwise() %>%
  mutate(
    d = across(starts_with("col_")),
    sampled = sample(c_across(names(d)), 1L),
    original_col = toString(names(d)[which(c_across(names(d)) %in% sampled)]), 
    d = NULL
  )

Method 2: wrap everything in a function

workflow <- function(df) {
  cols <- names(df)
  cols <- cols[starts_with("col_", vars = cols)]
  # or cols <- cols[startsWith(cols, "col_")]
  # or cols <- cols[grepl("^col_", cols)]
  # ...
  df %>%
    rowwise() %>%
    mutate(
      sampled = sample(c_across(!!cols), 1L),
      original_col = toString(cols[which(c_across(!!cols) %in% sampled)]),
    )
}

I prefer the second method as it is more flexible.

ekoam
  • 8,744
  • 1
  • 9
  • 22
  • Yes. This is neat. – Emman Nov 10 '20 at 11:59
  • Could you think of a version of your solution, just with a selecting helper function (such as `starts_with()`) rather than explicitly writing all variable names? Something like `starts_with("col_")`. – Emman Nov 10 '20 at 12:26