I'm looking for a tidy way to anonymize selected columns of a data frame.
The best I could come up with is to define a mapping table and then use `plyr::mapvalues()`, but I can't wrap my head around generalizing this so that it works in conjunction with `dplyr::mutate_at()` (see pseudo code below).
Or would this best be done via `purrr::map2()`?
library(magrittr)
# Two-row example data: two character columns (`name`, `surname`) and one
# numeric column (`value`).
df <- tibble::tibble(
  name = c("John", "Jane"),
  surname = c("Doe", "Doe"),
  value = c(10, 20)
)
seed <- 2093
cols_to_anon <- c("name", "surname")

# Build one lookup table per column to anonymize. The result is a list of
# tibbles named after the columns; each tibble pairs the column's distinct
# values (`original`) with a replacement code (`anon`, a letter followed by a
# number).
recode_table <- cols_to_anon %>%
  dplyr::syms() %>%
  purrr::map(function(.x) {
    uniques <- df %>%
      dplyr::distinct(!!.x) %>%
      dplyr::pull()
    n <- length(uniques)

    # The seed is reset before every draw so the table is reproducible.
    set.seed(seed)
    # seq_len() rather than 1:n: 1:0 is c(1, 0), so an empty column would
    # index a spurious NA instead of producing an empty table.
    original <- uniques[sample(seq_len(n))]
    set.seed(seed)
    anon_1 <- sample(LETTERS, n, replace = TRUE)
    set.seed(seed)
    anon_2 <- sample(1:1000, n, replace = TRUE)
    anon <- stringr::str_glue("{anon_1}{anon_2}")

    # Codes are drawn with replacement, so two different originals can receive
    # the same code. Recoding with such a table would silently merge distinct
    # values, so fail loudly instead.
    if (anyDuplicated(anon) > 0) {
      stop(
        "Anonymized codes are not unique for column '", as.character(.x),
        "'; try a different seed.",
        call. = FALSE
      )
    }

    tibble::tibble(original, anon)
  }) %>%
  purrr::set_names(cols_to_anon)
recode_table
#> $name
#> # A tibble: 2 x 2
#> original anon
#> <chr> <S3: glue>
#> 1 Jane W875
#> 2 John D149
#>
#> $surname
#> # A tibble: 1 x 2
#> original anon
#> <chr> <S3: glue>
#> 1 Doe W875
# Recode every column listed in `cols_to_anon` through its own lookup table in
# `recode_table`, rather than spelling out one mapvalues() call per column.
# purrr::map2() walks the columns and their tables in parallel; both are
# subset with `cols_to_anon`, so they are paired in the same order. Columns
# not listed in `cols_to_anon` are left as they are.
df_anon <- df
df_anon[cols_to_anon] <- purrr::map2(
  df[cols_to_anon],
  recode_table[cols_to_anon],
  function(.x, .y) plyr::mapvalues(.x, .y$original, .y$anon)
)
df_anon
#> # A tibble: 2 x 3
#> name surname value
#> <chr> <chr> <dbl>
#> 1 D149 W875 10
#> 2 W875 W875 20
Created on 2019-05-16 by the reprex package (v0.2.1.9000)
PSEUDO CODE OF "DESIRED" SOLUTION
df_anon <- df %>%
dplyr::mutate_at(
dplyr::vars(one_of(cols_to_anon)),
~plyr::mapvalues(<col_name_i>,
recode_table[[<col_name_i>]]$original,
recode_table[[<col_name_i>]]$anon
)
)
Here, `<col_name_i>` stands for the name of the respective column that is to be anonymized.