2

I'm looking for a tidy way to anonymize selected columns of a data frame.

The best I could come up with is to define a mapping table and then use plyr::mapvalues(), but I can't wrap my head around generalizing this to make it work in conjunction with dplyr::mutate_at() (see pseudo code below).

Or would this best be done via purrr::map2()?

library(magrittr)

# Example data: two identifying columns plus one numeric column that is kept.
df <- tibble::tribble(
  ~name,  ~surname, ~value,
  "John", "Doe",    10,
  "Jane", "Doe",    20
)

seed <- 2093
cols_to_anon <- c("name", "surname")

# Build one lookup table (original -> anon) per column in `cols_to_anon`.
# The result is a list of two-column tibbles, named after the columns.
recode_table <- cols_to_anon %>%
  dplyr::syms() %>%
  purrr::map(function(.x) {
    # Distinct values of the current column.
    uniques <- df %>%
      dplyr::distinct(!!.x) %>%
      dplyr::pull()
    n <- length(uniques)

    # Shuffle the distinct values. `sample.int(n)` draws the same permutation
    # as `sample(1:n)` for n >= 1, but stays empty for n == 0, where `1:n`
    # would expand to c(1, 0).
    set.seed(seed)
    original <- uniques[sample.int(n)]

    # NOTE(review): the seed is reset before every draw and for every column,
    # so the same codes reappear across columns (the printed result shows the
    # same code for "Jane" and for "Doe"), and `replace = TRUE` does not
    # guarantee distinct codes within a column.
    set.seed(seed)
    anon_1 <- sample(LETTERS, n, replace = TRUE)
    set.seed(seed)
    anon_2 <- sample(1:1000, n, replace = TRUE)

    # Code = one letter followed by a number, e.g. "W875".
    anon <- stringr::str_glue("{anon_1}{anon_2}")
    tibble::tibble(original, anon)
  }) %>%
  purrr::set_names(cols_to_anon)
recode_table
#> $name
#> # A tibble: 2 x 2
#>   original anon      
#>   <chr>    <S3: glue>
#> 1 Jane     W875      
#> 2 John     D149      
#> 
#> $surname
#> # A tibble: 1 x 2
#>   original anon      
#>   <chr>    <S3: glue>
#> 1 Doe      W875

# Recode each identifying column by hand: every value found in the column's
# lookup table (`original`) is replaced by its code (`anon`).
df_anon <- dplyr::mutate(
  df,
  name = plyr::mapvalues(
    x = name,
    from = recode_table[["name"]][["original"]],
    to = recode_table[["name"]][["anon"]]
  ),
  surname = plyr::mapvalues(
    x = surname,
    from = recode_table[["surname"]][["original"]],
    to = recode_table[["surname"]][["anon"]]
  )
)
df_anon
#> # A tibble: 2 x 3
#>   name  surname value
#>   <chr> <chr>   <dbl>
#> 1 D149  W875       10
#> 2 W875  W875       20

Created on 2019-05-16 by the reprex package (v0.2.1.9000)

PSEUDO CODE OF "DESIRED" SOLUTION

df_anon <- df %>%
  dplyr::mutate_at(
    dplyr::vars(one_of(cols_to_anon)),
    ~plyr::mapvalues(<col_name_i>,
      mtable_list[[<col_name_i>]]$original,
      mtable_list[[<col_name_i>]]$anon
    )
  )

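with `<col_name_i>` being the name of the respective column that is to be anonymized, and `mtable_list` being the named list of mapping tables (i.e. `recode_table` from the reprex above)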
Rappster
  • 12,762
  • 7
  • 71
  • 120
  • I don't totally get what you're trying to do. You want a function to generate a code for each first or last name? – camille May 16 '19 at 12:29
  • Maybe use [digest library](https://stackoverflow.com/q/21686645/680068) ? – zx8754 May 16 '19 at 12:29
  • Related post: https://stackoverflow.com/a/10455729/680068 – zx8754 May 16 '19 at 12:34
  • @camille yes, correct. – Rappster May 16 '19 at 12:34
  • @zx8754 yep, I could simply use digest for it, but I'm looking for the general pattern behind mapping one value to another in a tidy manner – Rappster May 16 '19 at 12:35
  • From 2nd link: `myData$myAnonSurname <- paste0("Surname_", as.numeric(as.factor(myData$surname)))` – zx8754 May 16 '19 at 12:38
  • I don't really know what `plyr::mapvalues` does—I just commented on another post that `plyr` was retired a while back in favor of `dplyr`. I think you could take some of the functions from the link @zx8754 posted, and just use that in `dplyr::mutate_at` across the variables you want to anonymize – camille May 16 '19 at 12:39
  • Yep, that sounds like a good direction to go. Thanks for the pointer @zx8754 – Rappster May 16 '19 at 12:44

1 Answer

2

One approach would be:

library(rlang)
library(stringr)
library(tidyverse)

# Example data: two identifying columns plus one numeric column that is kept.
df <- tibble::tribble(
  ~name,  ~surname, ~value,
  "John", "Doe",    10,
  "Jane", "Doe",    20
)
df

# Columns to anonymize, captured as unevaluated symbols for `select()`.
my_selection <- exprs(name, surname)

# Build one lookup table (original -> anon) per selected column.
# The result is a list of two-column tibbles, named after the columns.
recode_table <- map(
  select(df, !!!my_selection),
  function(column) {
    originals <- unique(column)

    # A code is one letter followed by a number in 1..1000, which gives
    # 26 * 1000 possible codes. Drawing code ids WITHOUT replacement
    # guarantees that two different originals never share a code; drawing
    # letter and number independently with `replace = TRUE` could collide.
    # `sample.int()` errors if a column has more distinct values than codes.
    n_codes <- length(LETTERS) * 1000L
    code_ids <- sample.int(n_codes, length(originals)) - 1L

    tibble(
      original = originals,
      # Decode the id into its letter (quotient) and number (remainder).
      anon = str_c(LETTERS[code_ids %/% 1000L + 1L], code_ids %% 1000L + 1L)
    )
  }
)
recode_table
# $name
# # A tibble: 2 x 2
# original anon 
# <chr>    <chr>
#   1 John     F330 
# 2 Jane     O445 
# 
# $surname
# # A tibble: 1 x 2
# original anon 
# <chr>    <chr>
#   1 Doe      N710 

# Replace every column that has a lookup table with its anonymized codes.
# The columns are overwritten in place, so the remaining columns and the
# original column order are left untouched (no need for the anonymized
# columns to come first). The result is a plain data frame.
df_anon <- as.data.frame(df)
df_anon[names(recode_table)] <- imap(
  recode_table,
  function(lookup, col_name) {
    # For each value of the column, find its row in the lookup table and take
    # the code stored there; values without an entry become NA.
    lookup$anon[match(df[[col_name]], lookup$original)]
  }
)
df_anon
# name surname value
# 1 F330    N710    10
# 2 O445    N710    20 
r.user.05apr
  • 5,356
  • 3
  • 22
  • 39
  • Nice! I like the `purrr` approach, just could not work out the join part before and also never used the `imap` family before. Thanks a lot – Rappster May 16 '19 at 19:58